#######################################################################
#################### HEADER, CHANGE VALUES HERE #######################
#######################################################################
# Default settings of the pipeline. Each of read_sets, prefix, k, b, c, d and g
# can be overridden on the command line by the option of the same letter
# (-r for read_sets, -p for prefix); see the getopts loop further down.

# Release string shown by the help text and by the run summary.
version="1.2.2"
# Space-separated list of read files given to both kissnp2 and kissreads.
# For instance: "read_set1.fa.gz read_set2.fq.gz"
read_sets="data_sample/reads_sequence1.fasta   data_sample/reads_sequence2.fasta.gz " # FOR instance: "read_set1.fa.gz read_set2.fq.gz"
# All temporary and final files are written with names starting with this prefix.
prefix="test_ref" # all temp and final files will be written will start with this prefix
# Size of the k-mers (-k of kissnp2). An even value is decremented by one
# before use, see the parity check below.
k=31 # size of kmers
# Branching strategy handed to kissnp2 through -b. According to the help text
# of this script: 0 forbids SNPs for which any of the two paths is branching,
# 1 is the "smart branching" mode (bubbles in which both paths are equally
# branching are discarded, all others are accepted), 2 puts no limitation.
# NOTE(review): the previous comment on this line described the value 0 as the
# smart branching mode, which contradicts the help text - confirm with kissnp2.
b=0
# Minimal coverage (-c of both kissnp2 and kissreads).
c=4 # minimal coverage
# Estimated number of errors per read (used by kissreads only).
d=1 # estimated number of error per read (used by kissreads only)
# Estimated genome size. Used only to control kissnp2 memory usage,
# e.g. 3 billion (3000000000) uses 4Gb of RAM.
g=10000000 # estimated genome size. Used only to control kissnp2 memory usage. e.g. 3 billion (3000000000) uses 4Gb of RAM.
# Path where the executables kissnp2 and kissreads are. The value is glued
# directly in front of the executable name, so it must end with a '/'.
# Leave blank if they are located in a directory listed in the PATH
# environment variable.
PATH_RS="./tools/"
#######################################################################
#################### END HEADER                 #######################
#######################################################################

# Print the usage message of run_discoSnp.sh on standard output.
# Globals:   version (read) - release string shown on the second line
# Arguments: none
# Outputs:   one line per option, indented with tabs
#
# Note: inside this script the function takes precedence over bash's own
# `help` builtin.
help() {
  printf '%s\n' "run_discoSnp.sh, a pipelining kissnp2 and kissreads for calling SNPs from NGS reads without the need of a reference genome"
  printf '%s\n' "Version $version"
  printf '%s\n' "Usage: ./run_discoSnp.sh OPT"
  printf '\tOPT:\n'
  printf '\t \t %s\n' "-r list of reads separated by space, surrounded by the '\"' character. Note that reads may be in fasta or fastq format, gzipped or not. Example: -r \"data_sample/reads_sequence1.fasta   data_sample/reads_sequence2.fasta.gz\"."
  printf '\t\t %s\n' "-b value. "
  printf '\t\t\t %s\n' "0: forbid SNPs for wich any of the two paths is branching (high precision, lowers the recal in complex genomes). Default value"
  printf '\t\t\t %s\n' "1: (smart branching) forbid SNPs for wich the two paths are branching (e.g. the two paths can be created either with a 'A' or a 'C' at the same position"
  printf '\t\t\t %s\n' "2: No limitation on branching (lowers the precision, high recall)"
  printf '\t\t %s\n' "-p prefix. All out files will start with this prefix. Example: -p my_prefix"
  printf '\t\t %s\n' "-k value. Set the length of used kmers. Must fit the compiled value. Default=31. Example -k 31"
  printf '\t\t %s\n' "-c value. Set the minimal coverage: Used by kissnp2 (don't use kmers with lower coverage) and kissreads (read coherency threshold). Default=4. Example -c 4"
  printf '\t\t %s\n' "-d value. Set the number of authorized substitutions used while mapping reads on found SNPs (kissreads). Default=1. Example: -d 1"
  # The example used to show "-d", which is the option of the previous line.
  printf '\t\t %s\n' "-g value. Estimated genome size. Used only to control kissnp2 memory usage. e.g. 3 billion (3000000000) uses 4Gb of RAM. Default=10 million. Example: -g 10000000"
  printf '\t\t %s\n' "-h: Prints this message and exits"
  printf '%s\n' "Any further question: read the readme file or contact us: pierre.peterlongo@inria.fr"
}


#######################################################################
#################### GET OPTIONS                #######################
#######################################################################
# Command-line parsing. Every recognised option is reported on stderr and
# replaces the matching default of the header; -h prints the usage and stops.
# The leading ':' of the option string puts getopts in silent mode: an unknown
# option is delivered as '?' and a missing argument as ':', with the offending
# option letter in OPTARG.
while getopts ":r:p:k:c:d:g:b:h" opt; do
  case "$opt" in
    h)  help; exit ;;
    r)  printf '%s\n' "use read set: $OPTARG" >&2;           read_sets=$OPTARG ;;
    b)  printf '%s\n' "use branching strategy: $OPTARG" >&2; b=$OPTARG ;;
    p)  printf '%s\n' "use prefix=$OPTARG" >&2;              prefix=$OPTARG ;;
    k)  printf '%s\n' "use k=$OPTARG" >&2;                   k=$OPTARG ;;
    c)  printf '%s\n' "use c=$OPTARG" >&2;                   c=$OPTARG ;;
    d)  printf '%s\n' "use d=$OPTARG" >&2;                   d=$OPTARG ;;
    g)  printf '%s\n' "use g=$OPTARG" >&2;                   g=$OPTARG ;;
    \?) printf '%s\n' "Invalid option: -$OPTARG" >&2;        exit 1 ;;
    :)  printf '%s\n' "Option -$OPTARG requires an argument." >&2; exit 1 ;;
  esac
done
#######################################################################
#################### END GET OPTIONS            #######################
#######################################################################




#######################################################################
#################### OPTIONS SUMMARY            #######################
#######################################################################
# Work out the directory reported in the summary, then print the effective
# parameters and the starting date.
#
# Nothing in this script assigns MY_PATH before this point, so the previous
# `cd "$MY_PATH"` was normally a `cd ""`, whose result POSIX leaves
# unspecified. Falling back to "." keeps the usual outcome (the current
# working directory, absolutized) while still honouring a MY_PATH inherited
# from the environment, as before.
MY_PATH=$(cd "${MY_PATH:-.}" && pwd)
if [ -z "$MY_PATH" ]; then
  # The directory could not be resolved (e.g. it is not accessible to the
  # script, or the working directory has been removed).
  printf '%s\n' "cannot determine the running directory" >&2
  exit 1
fi

# printf with quoted arguments prints the values exactly as they are; the
# unquoted `echo -e` form collapsed runs of blanks, expanded glob characters
# and interpreted backslashes found in the values.
printf '\tRunning discoSnp %s, in directory %s with following parameters:\n' "$version" "$MY_PATH"
printf '\t\t read_sets=%s\n' "$read_sets"
printf '\t\t prefix=%s\n' "$prefix"
printf '\t\t c=%s\n' "$c"
printf '\t\t k=%s\n' "$k"
printf '\t\t b=%s\n' "$b"
printf '\t\t d=%s\n' "$d"
printf '\t\t g=%s\n' "$g"
printf '\t starting date='
date
echo
#######################################################################
#################### END OPTIONS SUMMARY        #######################
#######################################################################






######### CHECK THE k PARITY ##########
# An even k is lowered to the odd value just below it; the message printed
# gives the reason (avoiding palindromes).
rest=$(( $k % 2 ))
case "$rest" in
  0)
    echo "k=$k is even number, to avoid palindromes, we set it to $(($k-1))"
    k=$(($k-1))
    ;;
esac
#######################################


#######################################################################
#################### KISSNP2                    #######################
#######################################################################
# Run kissnp2 on the read sets.
# See also the other options of kissnp2 with `kissnp2 -h`: filtration with max
# occurrence numbers (-C), filtration with respect to the size of contigs (-e),
# possibility to output only unitigs (-t instead of -T), ...
#
# The command is built once as an array so that the command executed and the
# command shown in the error message cannot drift apart.
# $read_sets stays unquoted on purpose: it is a space-separated list of files
# that has to be split into one argument per file.
# shellcheck disable=SC2086
kissnp2_cmd=("${PATH_RS}kissnp2" $read_sets -T -k "$k" -c "$c" -g "$g" -o "$prefix" -l -b "$b")

if ! "${kissnp2_cmd[@]}"; then
  printf '%s\n' "there was a problem with kissnp2, command line: ${kissnp2_cmd[*]}" >&2
  # A bare `exit` would return the status of the message above (0) and make
  # the failed run look successful to the caller.
  exit 1
fi


#######################################################################
#################### KISSREADS                  #######################
#######################################################################
# Map the reads back on the SNPs predicted by kissnp2 with kissreads, which
# writes the "coherent" and "uncoherent" result files.
smallk=$((k - 4)) # avoid modifying this.
i=5               # avoid modifying this.

# Common stem of the files of this run: <prefix>_k_<k>_c_<c>
kissreads_stem="${prefix}_k_${k}_c_${c}"

# Built once as an array so that the error message shows exactly the command
# that was executed. $read_sets stays unquoted on purpose: it is a
# space-separated list of files. In theory there is no need to look at the
# other options of kissreads.
# shellcheck disable=SC2086
kissreads_cmd=("${PATH_RS}kissreads" "${kissreads_stem}.fa" $read_sets -k "$smallk" -c "$c" -d "$d" -n -i "$i" -o "${kissreads_stem}_coherent" -u "${kissreads_stem}_uncoherent")

if ! "${kissreads_cmd[@]}"; then
  printf '%s\n' "there was a problem with kissreads, command line: ${kissreads_cmd[*]}" >&2
  # A bare `exit` would return the status of the message above (0).
  exit 1
fi
rm -f -- "${kissreads_stem}.fa" # useless now.

#######################################################################
#################### SORT AND FORMAT COHERENT RESULTS #################
#######################################################################
# Sort the coherent results (reverse general-numeric order), keep the second
# space-separated field and turn every ';' into a newline to obtain the final
# <prefix>_k_<k>_c_<c>_coherent.fa file.
coherent_file="${prefix}_k_${k}_c_${c}_coherent"

sort -rg "$coherent_file" | cut -d " " -f 2 | tr ';' '\n' > "${coherent_file}.fa"
# $? alone only reports the last stage (tr); copy the status of every stage
# right away, before another command overwrites PIPESTATUS.
sort_pipeline_status=("${PIPESTATUS[@]}")

for stage_status in "${sort_pipeline_status[@]}"; do
  if [ "$stage_status" -ne 0 ]; then
    printf '%s\n' "there was a problem with the result sorting, command line: sort -rg $coherent_file | cut -d \" \" -f 2 | tr ';' '\\n' > ${coherent_file}.fa" >&2
    # A bare `exit` would return the status of the message above (0).
    exit 1
  fi
done

# Closing report: name of the result file, ending date and a thank-you line.
echo "discoSnp done, coherent SNPs are stored in \""${prefix}_k_${k}_c_${c}_coherent.fa"\""
printf '\t ending date='
date
printf '\t Thanks for using discoSnp - http://colibread.inria.fr/discoSnp/\n'


#/Users/grizk/gassst  -d test_k_31_c_4_coherent.fa  -i /Users/grizk/kissnp2/kissnp2_tool/snp_list_readsnp  -p 100 -w 15 -m 8 -l 0 -r 1 -o temp;

