diff --git a/HybPhyloMaker0a_preparedata.sh b/HybPhyloMaker0a_preparedata.sh index ae8913d..a33fd3a 100644 --- a/HybPhyloMaker0a_preparedata.sh +++ b/HybPhyloMaker0a_preparedata.sh @@ -20,9 +20,9 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * -# * Script 00a - Download & prepare data * -# * v.1.3.0 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * Script 0a - Download & prepare data * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** diff --git a/HybPhyloMaker0b_preparereference.sh b/HybPhyloMaker0b_preparereference.sh index a56957d..28641fb 100644 --- a/HybPhyloMaker0b_preparereference.sh +++ b/HybPhyloMaker0b_preparereference.sh @@ -19,8 +19,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 0b - Prepare pseudoreference * -# * v.1.3.1 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** diff --git a/HybPhyloMaker0c_Rsetup_MetaCentrum.sh b/HybPhyloMaker0c_Rsetup_MetaCentrum.sh index bd14ccb..5f83473 100644 --- a/HybPhyloMaker0c_Rsetup_MetaCentrum.sh +++ b/HybPhyloMaker0c_Rsetup_MetaCentrum.sh @@ -9,9 +9,9 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * -# * Setup R packages * -# * v.1.3.1 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * Script 0c - Setup R packages on Metacentrum * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** diff --git a/HybPhyloMaker1_rawprocess.sh b/HybPhyloMaker1_rawprocess.sh index 725bb4a..4590b93 100644 --- a/HybPhyloMaker1_rawprocess.sh +++ b/HybPhyloMaker1_rawprocess.sh @@ -1,7 +1,7 @@ #!/bin/bash #----------------MetaCentrum---------------- #PBS -l walltime=1d -#PBS -l nodes=1:ppn=1:debian8 +#PBS -l nodes=1:ppn=1:debian8:minspec=29 #PBS -j oe #PBS -l mem=4gb #PBS -l scratch=80gb @@ -20,8 +20,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 01 - Raw data processing * -# * v.1.3.3 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # * based on Weitemier et al. (2014), Applications in Plant Science 2(9): 1400042* # ******************************************************************************** diff --git a/HybPhyloMaker2_readmapping.sh b/HybPhyloMaker2_readmapping.sh index 266e0c5..2ebb080 100644 --- a/HybPhyloMaker2_readmapping.sh +++ b/HybPhyloMaker2_readmapping.sh @@ -1,7 +1,7 @@ #!/bin/bash #----------------MetaCentrum---------------- #PBS -l walltime=2d -#PBS -l nodes=1:ppn=4 +#PBS -l nodes=1:ppn=4:minspec=29 #PBS -j oe #PBS -l mem=4gb #PBS -l scratch=80gb @@ -20,8 +20,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 02 - Read mapping using bowtie2 * -# * v.1.3.2 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** @@ -43,6 +43,7 @@ if [[ $PBS_O_HOST == *".cz" ]]; then module add samtools-1.3 module add perl-5.10.1 module add gcc-4.8.4 + module add python34-modules-gcc module add ococo-2016-11 elif [[ $HOSTNAME == compute-*-*.local ]]; then echo -e "\nHybPhyloMaker2 is running on Hydra...\n" @@ -57,6 +58,7 @@ elif [[ $HOSTNAME == compute-*-*.local ]]; then #Add necessary modules module load bioinformatics/bowtie2/2.2.6 module load bioinformatics/samtools/1.3 + module load bioinformatics/ococo/ #??? else echo -e "\nHybPhyloMaker2 is running locally...\n" #settings for local run @@ -222,19 +224,24 @@ for file in $(cat SamplesFileNames.txt); do echo "Copying BAM..." cp $path/$type/21mapped/${file}.bam . fi - #CONSENSUS USING OCOCO - echo "Making consensus with OCOCO..." - ococo -i ${file}.bam -c $mincov -F ${file}.fasta 2>/dev/null + #CONSENSUS USING KINDEL/OCOCO + if [[ $conscall =~ "ococo" ]]; then + echo "Making consensus with OCOCO..." + ococo -i ${file}.bam -x ococo64 -c $mincov -F ${file}.fasta 2>/dev/null + else + echo "Making consensus with kindel..." + kindel -m $mincov -t $majthres ${file}.bam > ${file}.fasta + fi #change name in fasta file - sed -i '1d' ${file}.fasta #delete first line - sed -i "1i >$file" ${file}.fasta #insert fasta header as a first line + sed -i.bak '1d' ${file}.fasta #delete first line + sed -i.bak "1i >$file" ${file}.fasta #insert fasta header as a first line #Remove line breaks from fasta file awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' ${file}.fasta > tmp && mv tmp ${file}.fasta #put $nrns Ns to variable 'a' and $nrns ?s to variable 'b' a=$(printf "%0.sN" $(seq 1 $nrns)) b=$(printf "%0.s?" $(seq 1 $nrns)) #replace all Ns separating exons by '?' - sed -i "s/$a/$b/g" ${file}.fasta + sed -i.bak "s/$a/$b/g" ${file}.fasta #copy results to home cp ${file}.fasta $path/$type/21mapped #delete BAM diff --git a/HybPhyloMaker3_generatepslx.sh b/HybPhyloMaker3_generatepslx.sh index 7b2e478..9ad4e38 100644 --- a/HybPhyloMaker3_generatepslx.sh +++ b/HybPhyloMaker3_generatepslx.sh @@ -21,8 +21,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 03 - Process consensus after mapping, make pslx files * -# * v.1.3.2 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # * based on Weitemier et al. (2014), Applications in Plant Science 2(9): 1400042* # ******************************************************************************** diff --git a/HybPhyloMaker4b_correctframe_translate.sh b/HybPhyloMaker4b_correctframe_translate.sh new file mode 100644 index 0000000..1393649 --- /dev/null +++ b/HybPhyloMaker4b_correctframe_translate.sh @@ -0,0 +1,278 @@ +#!/bin/bash +#PBS -l walltime=8h +#PBS -l nodes=1:ppn=4:minspec=29 +#PBS -j oe +#PBS -l mem=4gb +#PBS -l scratch=4gb +#PBS -N HybPhyloMaker4b_translate +#PBS -m abe + +# ******************************************************************************** +# * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * +# * Script 04b - Put exons to correct reading frame and translate * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * +# * tomas.fer@natur.cuni.cz * +# ******************************************************************************** + + +#Take MAFFT exon alignment files, check for correct reading frame (+ remove incomplete triplets at the beginning and end of each exon) +#and translate to amino acids using EMBOSS +#Produce concatenation of +#(1) corrected exonic nucleotide alignments (+ generate 2 partition files - by exons and by codon positions within exons) +#(2) translated exons alignments + +#Complete path and set configuration for selected location +if [[ $PBS_O_HOST == *".cz" ]]; then + echo -e "\nHybPhyloMaker-translate is running on MetaCentrum..." + #settings for MetaCentrum + #Copy file with settings from home and set variables from settings.cfg + cp -f $PBS_O_WORKDIR/settings.cfg . + . settings.cfg + . /packages/run/modules-2.0/init/bash + path=/storage/$server/home/$LOGNAME/$data + source=/storage/$server/home/$LOGNAME/HybSeqSource + othersourcepath=/storage/$server/home/$LOGNAME/$othersource + otherpslxpath=/storage/$server/home/$LOGNAME/$otherpslx + #Move to scratch + cd $SCRATCHDIR + #Add necessary modules + module add emboss-6.5.7 + module add perl-5.10.1 + module add python-3.4.1-gcc +elif [[ $HOSTNAME == compute-*-*.local ]]; then + echo -e "\nHybPhyloMaker-translate is running on Hydra..." + #settings for Hydra + #set variables from settings.cfg + . settings.cfg + path=../$data + source=../HybSeqSource + othersourcepath=../$othersource + otherpslxpath=../$otherpslx + #Make and enter work directory + mkdir -p workdir04b + cd workdir04b + #Add necessary modules + module load bioinformatics/ +else + echo -e "\nHybPhyloMaker-translate is running locally..." + #settings for local run + #set variables from settings.cfg + . settings.cfg + path=../$data + source=../HybSeqSource + othersourcepath=../$othersource + otherpslxpath=../$otherpslx + #Make and enter work directory + mkdir -p workdir04b + cd workdir04b +fi +#Setting for the case when working with cpDNA +if [[ $cp =~ "yes" ]]; then + echo -e "Working with cpDNA\n" + type="cp" +else + echo -e "Working with exons\n" + type="exons" +fi + +#Copy scripts +cp $source/catfasta2phyml.pl . +cp $source/AMAS.py . +#Copy MAFFT exon alignments +cp $path/$type/60mafft/*.mafft . +#Make a dir for corrected exons and concatenated loci +mkdir $path/$type/61mafft_corrected +mkdir $path/$type/62mafft_translated +mkdir $path/$type/80concatenated_exon_alignments_corrected +mkdir $path/$type/90concatenated_exon_alignments_translated + +#Translate into frame 1, 2 and 3, count number of stop codons and save it to file +echo -ne "Translating exons to all three reading frames and selecting the correct one..." +ls *.mafft > listOfMAFFTFiles.txt +for mafftfile in $(cat listOfMAFFTFiles.txt) +do + #Replace gaps in sequences by 'n' + sed -i.bak '/>/!s/-/n/g' $mafftfile + echo $mafftfile >> ${mafftfile}_stopcodonnr_overview.txt + for i in 1 2 3 + do + transeq fasta::$mafftfile -frame $i stdout | fgrep -o \* | wc -l >> stopcodonnr_${mafftfile}.txt + done + cat stopcodonnr_${mafftfile}.txt >> ${mafftfile}_stopcodonnr_overview.txt + cp ${mafftfile}_stopcodonnr_overview.txt $path/$type/62mafft_translated + #Select the frame with least number of stop codons + min=$(sort -n stopcodonnr_${mafftfile}.txt | head -1) + frame=$(grep -n ^${min}$ stopcodonnr_${mafftfile}.txt | cut -c1 | head -1) + #Generate modified DNA alignment (introduce frame shift) + #Number of added character is $frame-1 (i.e., frame1=0, frame2=1, frame3=2) + if [ "$frame" = 2 ]; then + add=2 + elif [ "$frame" = 3 ]; then + add=1 + elif [ "$frame" = 1 ]; then + add=0 + fi + addcharacter=$(printf '%*s' $add '' | tr ' ' 'n') + #Add 'n's at the beginning of each nucleotide sequence (first remove line breaks from FASTA file - awk command, second add '???' after each line not beginning with '>' - sed command) + awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' ${mafftfile} | sed '/>/!s/^/'"$addcharacter"'/' > ${mafftfile}.corrframe + #Add '???' at the end of each nucleotide sequence (i.e., after each line not beginning with '>') + #sed -i '/>/!s/$/???/' ${mafftfile}.corrframe + #Remove first three characters from sequences in case of frame shift to remove incomplete triplet + if [ "$frame" != 1 ]; then + sed -i.bak '/>/!s/^...//' ${mafftfile}.corrframe + fi + #Remove incomplete triplets from the end of exon (0, 1 or 2 bases) + #Compute length of the alignment (length of the first accession, i.e., second line) + len=$(cat ${mafftfile}.corrframe | head -2 | tail -1 | awk '{ print length }') + remove=`expr $len % 3` #Calculate modulo after division by 3, i.e., number of bases to be remove` + #Remove last 0, 1 or 2 characters from sequences (i.e., from each line not beginning with '>') + sed -i.bak '/>/!s/.\{'"$remove"'\}$//' ${mafftfile}.corrframe + #Copy corrected exon alignment to home + cp ${mafftfile}.corrframe $path/$type/61mafft_corrected + + #Translate corrected exon + transeq fasta::$mafftfile.corrframe -frame 1 -outseq ${mafftfile}.translated + #remove line breaks from translated FASTA file + awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' ${mafftfile}.translated > tmp && mv tmp ${mafftfile}.translated + #Add '???' at the end of each protein sequence (add '???' after each line not beginning with '>' - sed command) + #sed -i.bak '/>/!s/$/???/' ${mafftfile}.translated + #Remove last two characters from fasta headers (i.e., each line beginning with '>') - were introduced by EMBOSS transeq + sed -i.bak '/>/s/.\{2\}$//' ${mafftfile}.translated + #Copy translated exon alignment to home + cp ${mafftfile}.translated $path/$type/62mafft_translated +done +echo -e "done\n" + +echo -en "Preparing summary table..." +#combine the information about stop codons by frame +paste *overview.txt > stop_codons_by_frame.txt +#transpose the matrix +awk ' +{ + for (i=1; i<=NF; i++) { + a[NR,i] = $i + } +} +NF>p { p = NF } +END { + for(j=1; j<=p; j++) { + str=a[1,j] + for(i=2; i<=NR; i++){ + str=str" "a[i,j]; + } + print str + } +}' stop_codons_by_frame.txt > stop_codons_by_frame_transposed.txt +awk '{print $1 }' stop_codons_by_frame_transposed.txt > names.txt +awk '{print $2 " " $3 " " $4 }' stop_codons_by_frame_transposed.txt > stop_codons_by_frame.txt +#put the lowest number as last column +awk '{min=$1;for(i=1;i<=NF;i++){if($i < min) min = $i} print $0 " " min}' stop_codons_by_frame.txt > tmp && mv tmp stop_codons_by_frame.txt +#put the number of zero stop codons as last column +awk '{zeros=0;for(i=1;i<=NF-1;i++){if($i=="0") zeros++ } print $0 " " zeros }' stop_codons_by_frame.txt > tmp && mv tmp stop_codons_by_frame.txt +#combine back with names +paste names.txt stop_codons_by_frame.txt > tmp && mv tmp stop_codons_by_frame.txt +cat stop_codons_by_frame.txt | tr " " "\t" > tmp && mv tmp stop_codons_by_frame.txt +echo -e "done\n" + +#Filter exons +echo -ne "Filtering exons..." +#1. Remove exons with more than one reading frame with 0 stop codons +awk '{if($6>1) print $1}' stop_codons_by_frame.txt > removed_more_than_1_possible_reading_frame.txt +awk '{if($6<=1) print $0}' stop_codons_by_frame.txt > selected_exons.txt +#2. Remove exons with lowest number of stop codons exceeding threshold (maxstop) +awk -v val=$maxstop '{if($5>val) print $1}' stop_codons_by_frame.txt > removed_lowest_number_of_stop_codons_exceeded_maxstop.txt +awk -v val=$maxstop '{if($5<=val) print $1}' selected_exons.txt > tmp && mv tmp selected_exons.txt +#Copy lists to home +cp selected_exons.txt $path/$type/62mafft_translated +cp removed*.txt $path/$type/62mafft_translated + +#Modify translation summary and copy to home +sed -i.bak 's/To_align_Assembly_//g' stop_codons_by_frame.txt +sed -i.bak2 's/.fasta.mafft//g' stop_codons_by_frame.txt +cp stop_codons_by_frame.txt $path/$type/62mafft_translated + +#Copy selected exons to folder +mkdir selected +mkdir selectedtranslated +for i in $(cat selected_exons.txt); do + cp ${i}.translated selectedtranslated + cp ${i}.corrframe selected +done + +#Modify selected exons (replace stopcodons by NNN) +for i in $(cat selected_exons.txt); do + sed 's/.../&+/g;s/+$//' selected/${i}.corrframe | sed 's/taa/NNN/g' | sed 's/tag/NNN/g' | sed 's/tga/NNN/g' | sed 's/+//g' > selected/tmp && mv selected/tmp selected/${i}.corrframe + cp selected/${i}.corrframe $path/$type/61mafft_corrected +done +echo -e "done\n" +#-----------------------CONCATENATE MODIFIED NUCLEOTIDE EXON ALIGNMENTS----------------------- +echo -ne "Concatenating corrected exons..." +#Modify mafft file names (from, i.e., To_align_Assembly_10372_Contig_1_516.fasta.mafft to To_align_Assembly_10372_*mafft) +#(all files starting with "To_align_Assembly_10372_" will be merged) +ls -1 selected/*.corrframe | cut -d'_' -f4 | sort -u | sed s/^/To_align_Assembly_/g | sed s/\$/_*mafft.corrframe/g > fileNamesForConcat.txt +#Modify mafft file names - prepare names for saving concatenate alignments (not possible to use names above as they contain '*'), e.g. Assembly_10372 +ls -1 selected/*.corrframe | cut -d'_' -f4 | sort -u | sed s/^/CorrectedAssembly_/g > fileNamesForSaving.txt +#Combine both files (make single file with two columns) +paste fileNamesForConcat.txt fileNamesForSaving.txt > fileForLoop.txt + +#Concatenate the exon alignments (values from first and second column of fileForLoop.txt are assigned to variable 'a' and 'b', respectively), +#transform fasta to phylip format, copy results from scratch to home +cat fileForLoop.txt | while read -r a b; do + python3 AMAS.py concat -i selected/$a -f fasta -d dna -u fasta -t selected/${b}.fasta >/dev/null + python3 AMAS.py concat -i selected/$a -f fasta -d dna -u phylip -t selected/${b}.phylip >/dev/null + #Modify partition file for partitioning by codon positions + # cat partitions.txt #take the file + # cut -d"_" -f2- #take 2nd and following parts of each line separated by "_" (remove 'p1_' etc. introduced by AMAS) + # awk '{ print $0 "\\3;" }' # + # awk '{for(i=0;i<3;i++)print}' #print each line 3-times + # awk -F '[=-]' '{ if (NR % 3 == 1) print $1 "_1=" $2 "-" $3; else if (NR % 3 == 2) print $1 "_2=" $2+1 "-" $3; else print $1 "_3=" $2+2 "-" $3}' + # #set separators to '=' and '-' and on 1st, 2nd and 3rd line do different modifications (add _1, _2 or _3; leave unchanged, add 1 or add 2 to $2, i.e. start position) + # sed 's/-/ - /g' #replace "-" by " - " + # sed 's/=/ = /g' #replace "=" by " = " + cat partitions.txt | cut -d"_" -f2- | awk '{ print $0 "\\3;" }' | awk '{for(i=0;i<3;i++)print}' | awk -F '[=-]' '{ if (NR % 3 == 1) print $1 "_1=" $2 "-" $3; else if (NR % 3 == 2) print $1 "_2=" $2+1 "-" $3; else print $1 "_3=" $2+2 "-" $3}' | sed 's/-/ - /g' | sed 's/=/ = /g' | sed -e 's/^/DNA, /g' -e 's/To_align_//g' | sed 's/;//g' > selected/${b}.codonpart.file + #modify and rename partitions.txt + cat partitions.txt | cut -d"_" -f2- | sed -e 's/^/DNA, /g' -e 's/To_align_//g' > selected/${b}.part + #perl catfasta2phyml.pl -f $a > $b.fasta + #perl catfasta2phyml.pl $b.fasta > $b.phylip + #Copy concatenated alignments and partition files to home + cp selected/$b.* $path/$type/80concatenated_exon_alignments_corrected +done +echo -e "done\n" + +#-----------------------CONCATENATE TRANSLATED EXON ALIGNMENTS----------------------- +echo -ne "Concatenating translated exons..." +#Modify mafft file names (from, i.e., To_align_Assembly_10372_Contig_1_516.fasta.mafft.translated to To_align_Assembly_10372_*mafft.translated) +#(all files starting with "To_align_Assembly_10372_" will be merged) +ls -1 selectedtranslated/*.translated | cut -d'_' -f4 | sort -u | sed s/^/To_align_Assembly_/g | sed s/\$/_*mafft.translated/g > fileNamesForConcat.txt +#Modify mafft file names - prepare names for saving concatenate alignments (not possible to use names above as they contain '*'), e.g. Assembly_10372 +ls -1 selectedtranslated/*.translated | cut -d'_' -f4 | sort -u | sed s/^/TranslatedAssembly_/g > fileNamesForSaving.txt +#Combine both files (make single file with two columns) +paste fileNamesForConcat.txt fileNamesForSaving.txt > fileForLoop.txt +#Concatenate the exon alignments (values from first and second column of fileForLoop.txt are assigned to variable 'a' and 'b', respectively), +#transform fasta to phylip format, copy results from scratch to home +cat fileForLoop.txt | while read -r a b; do + python3 AMAS.py concat -i selectedtranslated/$a -f fasta -d aa -u fasta -t selectedtranslated/${b}.fasta >/dev/null + python3 AMAS.py concat -i selectedtranslated/$a -f fasta -d aa -u phylip -t selectedtranslated/${b}.phylip >/dev/null + #modify and rename partitions.txt + sed -i.bak -e 's/^/WAG, /g' -e 's/To_align_//g' partitions.txt + mv partitions.txt selectedtranslated/${b}.part + #perl catfasta2phyml.pl -f $a > $b.fasta + #perl catfasta2phyml.pl $b.fasta > $b.phylip + #Copy translated concatenated alignments to home + cp selectedtranslated/$b.* $path/$type/90concatenated_exon_alignments_translated +done +echo "done\n" + +#Clean scratch/work directory +if [[ $PBS_O_HOST == *".cz" ]]; then + #delete scratch + if [[ ! $SCRATCHDIR == "" ]]; then + rm -rf $SCRATCHDIR/* + fi +else + cd .. + rm -r workdir04b +fi + +echo -e "\nScript HybPhyloMaker04b finished...\n" \ No newline at end of file diff --git a/HybPhyloMaker5_missingdataremoval.sh b/HybPhyloMaker5_missingdataremoval.sh index ba4a8d4..d584469 100644 --- a/HybPhyloMaker5_missingdataremoval.sh +++ b/HybPhyloMaker5_missingdataremoval.sh @@ -1,7 +1,7 @@ #!/bin/bash #----------------MetaCentrum---------------- #PBS -l walltime=1d -#PBS -l nodes=1:ppn=4 +#PBS -l nodes=1:ppn=4:minspec=29 #PBS -j oe #PBS -l mem=12gb #PBS -l scratch=4gb @@ -22,8 +22,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 05 - Missing data handling * -# * v.1.3.1 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** @@ -96,6 +96,17 @@ else type="exons" fi +#Settings for (un)corrected reading frame +if [[ $corrected =~ "yes" ]]; then + alnpath=$type/80concatenated_exon_alignments_corrected + alnpathselected=$type/81selected_corrected + treepath=$type/82trees_corrected +else + alnpath=$type/70concatenated_exon_alignments + alnpathselected=$type/71selected + treepath=$type/72trees +fi + #Check necessary file echo -ne "Testing if input data are available..." if [[ $cp =~ "yes" ]]; then @@ -113,36 +124,67 @@ if [[ $cp =~ "yes" ]]; then exit 3 fi else - if [ -d "$path/$type/70concatenated_exon_alignments" ]; then - if [ "$(ls -A $path/$type/70concatenated_exon_alignments)" ]; then - echo -e "OK\n" + if [[ $corrected =~ "yes" ]]; then + if [ -d "$path/$type/80concatenated_exon_alignments_corrected" ]; then + if [ "$(ls -A $path/$type/80concatenated_exon_alignments_corrected)" ]; then + echo -e "OK\n" + else + echo -e "'$path/$type/80concatenated_exon_alignments_corrected' is empty. Exiting...\n" + rm -d ../workdir05/ 2>/dev/null + exit 3 + fi else - echo -e "'$path/$type/70concatenated_exon_alignments' is empty. Exiting...\n" + echo -e "'$path/$type/80concatenated_exon_alignments_corrected' is missing. Exiting...\n" rm -d ../workdir05/ 2>/dev/null exit 3 fi else - echo -e "'$path/$type/70concatenated_exon_alignments' is missing. Exiting...\n" - rm -d ../workdir05/ 2>/dev/null - exit 3 + if [ -d "$path/$type/70concatenated_exon_alignments" ]; then + if [ "$(ls -A $path/$type/70concatenated_exon_alignments)" ]; then + echo -e "OK\n" + else + echo -e "'$path/$type/70concatenated_exon_alignments' is empty. Exiting...\n" + rm -d ../workdir05/ 2>/dev/null + exit 3 + fi + else + echo -e "'$path/$type/70concatenated_exon_alignments' is missing. Exiting...\n" + rm -d ../workdir05/ 2>/dev/null + exit 3 + fi fi fi #Test if folder for results exits -if [ -d "$path/$type/71selected${MISSINGPERCENT}" ]; then - echo -e "Directory '$path/$type/71selected${MISSINGPERCENT}' already exists. Delete it or rename before running this script again. Exiting...\n" - rm -d ../workdir05/ 2>/dev/null - exit 3 +if [[ $corrected =~ "yes" ]]; then + if [ -d "$path/$type/81selected_corrected${MISSINGPERCENT}" ]; then + echo -e "Directory '$path/$type/81selected_corrected${MISSINGPERCENT}' already exists. Delete it or rename before running this script again. Exiting...\n" + rm -d ../workdir05/ 2>/dev/null + exit 3 + else + if [[ ! $location == "1" ]]; then + if [ "$(ls -A ../workdir05)" ]; then + echo -e "Directory 'workdir05' already exists and is not empty. Delete it or rename before running this script again. Exiting...\n" + rm -d ../workdir05/ 2>/dev/null + exit 3 + fi + fi + fi else - if [[ ! $location == "1" ]]; then - if [ "$(ls -A ../workdir05)" ]; then - echo -e "Directory 'workdir05' already exists and is not empty. Delete it or rename before running this script again. Exiting...\n" - rm -d ../workdir05/ 2>/dev/null - exit 3 + if [ -d "$path/$type/71selected${MISSINGPERCENT}" ]; then + echo -e "Directory '$path/$type/71selected${MISSINGPERCENT}' already exists. Delete it or rename before running this script again. Exiting...\n" + rm -d ../workdir05/ 2>/dev/null + exit 3 + else + if [[ ! $location == "1" ]]; then + if [ "$(ls -A ../workdir05)" ]; then + echo -e "Directory 'workdir05' already exists and is not empty. Delete it or rename before running this script again. Exiting...\n" + rm -d ../workdir05/ 2>/dev/null + exit 3 + fi fi fi fi - #Copy data folder to scratch if [[ $cp =~ "yes" ]]; then cp $path/$type/60mafft/*.fasta . @@ -151,15 +193,15 @@ if [[ $cp =~ "yes" ]]; then # mv "$file" "${file%.fasta.mafft}.fasta" # done else - cp $path/$type/70concatenated_exon_alignments/*.fasta . + cp $path/$alnpath/*.fasta . fi #-----------PREPARE ALIGNMENTS WITH SPECIES WITH MAXIMUM SPECIFIED MISSING DATA ONLY---------------------- #Make a list of all fasta files ls *.fasta | cut -d"." -f1 > fileForDeletePercentage.txt #Make new dir for results -mkdir $path/$type/71selected${MISSINGPERCENT} -mkdir $path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT} +mkdir $path/${alnpathselected}${MISSINGPERCENT} +mkdir $path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT} numberfiles=$(cat fileForDeletePercentage.txt | wc -l) calculating=0 for file in $(cat fileForDeletePercentage.txt) @@ -216,13 +258,13 @@ do #sed -i.bak '/^>/{s/ /\n/}' ${file}_modif${MISSINGPERCENT}.fas cat ${file}_modif${MISSINGPERCENT}.fas | tr " " "\n" > tmp && mv tmp ${file}_modif${MISSINGPERCENT}.fas #Copy results home - cp ${file}_${MISSINGPERCENT}percN.fas $path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT} - cp ${file}_modif${MISSINGPERCENT}.fas $path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT} + cp ${file}_${MISSINGPERCENT}percN.fas $path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT} + cp ${file}_modif${MISSINGPERCENT}.fas $path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT} calculating=$((calculating + 1)) echo -e "\t$file processed ($calculating out of $numberfiles)\n" done echo -e "\nDeleting samples with more than ${MISSINGPERCENT}% missing data finished." -echo -e "Modified alignments saved in $path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}" +echo -e "Modified alignments saved in $path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}" #-----------MAKE A TABLE WITH % OF MISSING DATA IN EACH SPECIES AND ASSEMBLY---------------------- #Prepare header file @@ -236,7 +278,7 @@ awk '{_[FNR]=(_[FNR] OFS $2)}END{for (i=1; i<=FNR; i++) {sub(/^ /,"",_[i]); prin sed -i.bak '1s/^/species\n/' headers.txt #Combine headers and table with missing data paste headers.txt missing_percentage_overview.txt > missing_percentage_overview_and_headers.txt -cp missing_percentage_overview_and_headers.txt $path/$type/71selected${MISSINGPERCENT} +cp missing_percentage_overview_and_headers.txt $path/${alnpathselected}${MISSINGPERCENT} # Transpose data matrix (with percentages of missing data) awk ' @@ -285,7 +327,7 @@ sed -i '1s/ 0/ nr_assemblies_with_completely_missing_data/' transposedPlusMeanPl # Combine AssembliesList.txt and transposedPlusMeanPlusNumberOf100.txt paste AssembliesList.txt transposedPlusMeanPlusNumberOf100.txt > MissingDataOverview.txt # Copy table and list to home -cp MissingDataOverview.txt $path/$type/71selected${MISSINGPERCENT} +cp MissingDataOverview.txt $path/${alnpathselected}${MISSINGPERCENT} echo -e "Table with % of missing data per gene and sample saved to $path/$type/71selected${MISSINGPERCENT}\n" #-----------SELECTION OF MOST COMPLETE ASSEMBLIES---------------------- @@ -339,8 +381,8 @@ cat MissingDataOverview_${MISSINGPERCENT}.txt average_missing_${MISSINGPERCENT}. # Select assemblies with more than 75% of species (and delete first and last line including header and sum legend) awk -F' ' -v val=$SPECIESPRESENCE '( $(NF) > val/100 ) { print $1 }' MissingDataOverview_${MISSINGPERCENT}.txt | tail -n +2 | head -n -2 > selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt # Copy table and list to home -cp MissingDataOverview_${MISSINGPERCENT}.txt $path/$type/71selected${MISSINGPERCENT} -cp selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt $path/$type/71selected${MISSINGPERCENT} +cp MissingDataOverview_${MISSINGPERCENT}.txt $path/${alnpathselected}${MISSINGPERCENT} +cp selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt $path/${alnpathselected}${MISSINGPERCENT} echo -e "Assemblies including at least ${SPECIESPRESENCE}% of species selected" #-----------CALCULATE CHARACTERISTICS FOR ALL GENES AND FOR SELECTED GENES (USING AMAS)---------------------- @@ -397,8 +439,8 @@ else R --slave -f alignmentSummary.R fi #Copy summary table to home -cp summaryALL.txt $path/$type/71selected${MISSINGPERCENT} -cp *.png $path/$type/71selected${MISSINGPERCENT} +cp summaryALL.txt $path/${alnpathselected}${MISSINGPERCENT} +cp *.png $path/${alnpathselected}${MISSINGPERCENT} rm summaryALL.txt rm *.png # 2. For selected genes @@ -455,8 +497,8 @@ mv summaryALL.txt summarySELECTED_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt #Rename all PNG files generated by R (add '_${MISSINGPERCENT}_${SPECIESPRESENCE}') for file in *.png; do mv "$file" "${file/.png/_${MISSINGPERCENT}_${SPECIESPRESENCE}.png}"; done #Copy summary table and PNG files to home -cp summarySELECTED_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt $path/$type/71selected${MISSINGPERCENT} -cp *.png $path/$type/71selected${MISSINGPERCENT} +cp summarySELECTED_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt $path/${alnpathselected}${MISSINGPERCENT} +cp *.png $path/${alnpathselected}${MISSINGPERCENT} #Clean scratch/work directory if [[ $PBS_O_HOST == *".cz" ]]; then diff --git a/HybPhyloMaker6a2_RAxML_trees_summary.sh b/HybPhyloMaker6a2_RAxML_trees_summary.sh index f599ebd..0ebe34d 100644 --- a/HybPhyloMaker6a2_RAxML_trees_summary.sh +++ b/HybPhyloMaker6a2_RAxML_trees_summary.sh @@ -20,8 +20,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 06a2 - summary of RAxML gene trees * -# * v.1.3.1 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** @@ -76,43 +76,54 @@ else type="exons" fi +#Settings for (un)corrected reading frame +if [[ $corrected =~ "yes" ]]; then + alnpath=$type/80concatenated_exon_alignments_corrected + alnpathselected=$type/81selected_corrected + treepath=$type/82trees_corrected +else + alnpath=$type/70concatenated_exon_alignments + alnpathselected=$type/71selected + treepath=$type/72trees +fi + #Check necessary file echo -ne "Testing if input data are available..." -if [ -f "$path/$type/71selected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt" ]; then - if [ -d "$path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}" ]; then - if [ "$(ls -A $path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT})" ]; then - if [ -d "$path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML" ]; then - if [ "$(ls -A $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML)" ]; then +if [ -f "$path/${alnpathselected}${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt" ]; then + if [ -d "$path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}" ]; then + if [ "$(ls -A $path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT})" ]; then + if [ -d "$path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML" ]; then + if [ "$(ls -A $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML)" ]; then echo -e "OK\n" else - echo -e "'$path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML' is empty. Exiting...\n" + echo -e "'$path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML' is empty. Exiting...\n" rm -d ../workdir06a2/ 2>/dev/null exit 3 fi else - echo -e "'$path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML' is missing. Exiting...\n" + echo -e "'$path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML' is missing. Exiting...\n" rm -d ../workdir06a2/ 2>/dev/null exit 3 fi else - echo -e "'$path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is empty. Exiting...\n" + echo -e "'$path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is empty. Exiting...\n" rm -d ../workdir06a2/ 2>/dev/null exit 3 fi else - echo -e "'$path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is missing. Exiting...\n" + echo -e "'$path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is missing. Exiting...\n" rm -d ../workdir06a2/ 2>/dev/null exit 3 fi else - echo -e "'$path/$type/71selected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt' is missing. Exiting...\n" + echo -e "'$path/${alnpathselected}${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt' is missing. Exiting...\n" rm -d ../workdir06a2/ 2>/dev/null exit 3 fi #Test if folder for results exits -if [ -f "$path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML/gene_properties.txt" ]; then - echo -e "File '$path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML/gene_properties.txt' already exists. You are probably going to owerwrite previous results. Delete this file or rename before running this script again. Exiting...\n" +if [ -f "$path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML/gene_properties.txt" ]; then + echo -e "File '$path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML/gene_properties.txt' already exists. You are probably going to overwrite previous results. Delete this file or rename before running this script again. Exiting...\n" rm -d ../workdir06a2/ 2>/dev/null exit 3 else @@ -133,12 +144,12 @@ cp $source/LBscores.R . mkdir trees mkdir alignments #Copy all fasta alignments to subfolder 'alignments' -cp $path/$type/71selected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt . +cp $path/${alnpathselected}${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt . for i in $(cat selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt); do - cp $path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}/${i}_modif${MISSINGPERCENT}.fas alignments/ + cp $path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}/${i}_modif${MISSINGPERCENT}.fas alignments/ done #Copy all RAxML tree files (*.tre) with bootstrap values to subfolder 'trees' -cp $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML/*bipartitions.Assembly* trees/ +cp $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML/*bipartitions.*Assembly* trees/ #Rename RAxML trees cd trees for i in *; do @@ -166,9 +177,9 @@ done for i in $(cat LBscores.csv | sed 1d | cut -d"," -f5 | sort | uniq); do grep $i LBscores.csv | awk -F',' -v val=$i '{ if ($5 == val) result=$1 } END { print val "\t" result }' >> LBscoresSDPerLocus.txt done -cp LBscores.csv $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML -cp LBscoresPerTaxon.txt $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML -cp LBscoresSDPerLocus.txt $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML +cp LBscores.csv $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML +cp LBscoresPerTaxon.txt $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML +cp LBscoresSDPerLocus.txt $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML #Combine 'LBscoresSDPerLocus.txt' with 'tree_stats_table.csv' awk '{ print $2 }' LBscoresSDPerLocus.txt > tmp && mv tmp LBscoresSDPerLocus.txt paste tree_stats_table.csv LBscoresSDPerLocus.txt | tr "\t" "," > tmp && mv tmp tree_stats_table.csv @@ -185,15 +196,15 @@ else fi #Copy results to home -cp tree_stats_table.csv $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML -cp *.png $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML +cp tree_stats_table.csv $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML +cp *.png $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML #----------------Combine tree summary table with alignment summary and print comparison plots---------------- #Copy script cp $source/plotting_correlations.R . echo -e "Combining alignment and tree properties...\n" #Copy alignment summary -cp $path/$type/71selected${MISSINGPERCENT}/summarySELECTED_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt . +cp $path/${alnpathselected}${MISSINGPERCENT}/summarySELECTED_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt . #***Modify alignment summary*** #Remove '_Assembly' (now locus names start with a number) sed -i.bak 's/Assembly_//g' summarySELECTED*.txt @@ -237,13 +248,13 @@ if [[ $location == "1" ]]; then else R --slave -f plotting_correlations.R >> R.log 2>&1 fi -cp genes_corrs.* $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML +cp genes_corrs.* $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML rm genes_corrs.* mv combined.txt gene_properties.txt -cp gene_properties.txt $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML +cp gene_properties.txt $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML #Copy R.log to home -cp R.log $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML +cp R.log $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML #Clean scratch/work directory if [[ $PBS_O_HOST == *".cz" ]]; then diff --git a/HybPhyloMaker6a_RAxML_for_selected.sh b/HybPhyloMaker6a_RAxML_for_selected.sh index 578d6c1..3a6200c 100644 --- a/HybPhyloMaker6a_RAxML_for_selected.sh +++ b/HybPhyloMaker6a_RAxML_for_selected.sh @@ -20,8 +20,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 06a - RAxML gene tree building * -# * v.1.3.3 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2015 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** @@ -77,31 +77,50 @@ else type="exons" fi +#Settings for (un)corrected reading frame +if [[ $corrected =~ "yes" ]]; then + alnpath=$type/80concatenated_exon_alignments_corrected + alnpathselected=$type/81selected_corrected + treepath=$type/82trees_corrected +else + alnpath=$type/70concatenated_exon_alignments + alnpathselected=$type/71selected + treepath=$type/72trees +fi + +#Check compatible setting (corrected=no is incompatible with genepart=codon) +if [[ $genetreepart == "codon" ]] && [[ $corrected == "no" ]]; then + echo "You have incompatible settings (partitioning by codon [genetreepart=codon] is not allowed with uncorrected data [corrected=no]." + echo "Change the settings before running the script again..." + rm -d ../workdir06a/ 2>/dev/null + exit 3 +fi + #Check necessary file echo -ne "Testing if input data are available..." -if [ -d "$path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}" ]; then - if [ "$(ls -A $path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT})" ]; then - if [ -f "$path/$type/71selected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt" ]; then +if [ -d "$path/$alnpathselected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}" ]; then + if [ "$(ls -A $path/$alnpathselected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT})" ]; then + if [ -f "$path/$alnpathselected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt" ]; then echo -e "OK\n" else - echo -e "'$path/$type/71selected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt' is missing. Exiting...\n" + echo -e "'$path/$alnpathselected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt' is missing. Exiting...\n" rm -d ../workdir06a/ 2>/dev/null exit 3 fi else - echo -e "'$path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is empty. Exiting...\n" + echo -e "'$path/$alnpathselected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is empty. Exiting...\n" rm -d ../workdir06a/ 2>/dev/null exit 3 fi else - echo -e "'$path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is missing. Exiting...\n" + echo -e "'$path/$alnpathselected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is missing. Exiting...\n" rm -d ../workdir06a/ 2>/dev/null exit 3 fi #Test if folder for results exits -if [ -d "$path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML" ]; then - echo -e "Directory '$path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML' already exists. Delete it or rename before running this script again. Exiting...\n" +if [ -d "$path/$treepath${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML" ]; then + echo -e "Directory '$path/$treepath${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML' already exists. Delete it or rename before running this script again. Exiting...\n" rm -d ../workdir06a/ 2>/dev/null exit 3 else @@ -115,9 +134,9 @@ else fi #Add necessary scripts and files -cp $path/$type/71selected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt . -mkdir -p $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE} -mkdir $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML +cp $path/${alnpathselected}${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt . +mkdir -p $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE} +mkdir $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML if [[ $location == "1" || $location == "2" ]]; then echo -e "\nGenerating multiple jobs with $raxmlperjob alignments per job..." @@ -125,7 +144,7 @@ if [[ $location == "1" || $location == "2" ]]; then #Divide selected_genes$CUT.txt into files by $raxmlperjob split --lines=$raxmlperjob selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}. rm selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt - cp selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.* $path/$type/71selected${MISSINGPERCENT} + cp selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.* $path/${alnpathselected}${MISSINGPERCENT} for group in $(ls selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.*) do echo '#!/bin/bash' >> ${group}.sh @@ -163,15 +182,30 @@ if [[ $location == "1" || $location == "2" ]]; then echo 'MISSINGPERCENT='"$MISSINGPERCENT" >> ${group}.sh echo 'SPECIESPRESENCE='"$SPECIESPRESENCE" >> ${group}.sh echo 'type='"$type" >> ${group}.sh + echo 'corrected='"$corrected" >> ${group}.sh echo 'location='"$location" >> ${group}.sh echo 'genetreepart='"$genetreepart" >> ${group}.sh echo 'raxmlpthreads='"$raxmlpthreads" >> ${group}.sh - echo 'cp '"$path"'/$type/71selected${MISSINGPERCENT}/'"$group"' .' >> ${group}.sh + echo 'if [[ $corrected =~ "yes" ]]; then' >> ${group}.sh + echo ' alnpath=$type/80concatenated_exon_alignments_corrected' >> ${group}.sh + echo ' alnpathselected=$type/81selected_corrected' >> ${group}.sh + echo ' treepath=$type/82trees_corrected' >> ${group}.sh + echo 'else' >> ${group}.sh + echo ' alnpath=$type/70concatenated_exon_alignments' >> ${group}.sh + echo ' alnpathselected=$type/71selected' >> ${group}.sh + echo ' treepath=$type/72trees' >> ${group}.sh + echo 'fi' >> ${group}.sh + echo 'cp '"$path"'/${alnpathselected}${MISSINGPERCENT}/'"$group"' .' >> ${group}.sh echo 'cp '"$source"'/catfasta2phyml.pl .' >> ${group}.sh echo 'for i in $(cat '"$group"')' >> ${group}.sh echo 'do' >> ${group}.sh - echo ' cp '"$path"'/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}/${i}_modif${MISSINGPERCENT}.fas .' >> ${group}.sh - echo ' cp $path/$type/70concatenated_exon_alignments/${i}.part .' >> ${group}.sh + echo ' cp '"$path"'/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}/${i}_modif${MISSINGPERCENT}.fas .' >> ${group}.sh + echo ' if [[ $genetreepart == "exon" ]]; then' >> ${group}.sh + echo ' cp $path/${alnpath}/${i}.part .' >> ${group}.sh + echo ' elif [[ $genetreepart == "codon" ]]; then' >> ${group}.sh + echo ' cp $path/${alnpath}/${i}.codonpart.file .' >> ${group}.sh + echo ' mv ${i}.codonpart.file ${i}.part' >> ${group}.sh + echo ' fi' >> ${group}.sh echo ' #Substitute '"'('"' by '"'_'"' and '"')'"' by nothing ('"'('"' and '"')'"' not allowed in RAxML)' >> ${group}.sh echo ' sed -i '"'s/(/_/g'"' ${i}_modif${MISSINGPERCENT}.fas' >> ${group}.sh echo ' sed -i '"'s/)//g'"' ${i}_modif${MISSINGPERCENT}.fas' >> ${group}.sh @@ -186,20 +220,43 @@ if [[ $location == "1" || $location == "2" ]]; then echo ' #modify name for partition file (remove '_modif${MISSINGPERCENT}')' >> ${group}.sh echo ' filepart=$(sed "s/_modif${MISSINGPERCENT}//" <<< $file)' >> ${group}.sh echo ' #RAxML with 100 rapid bootstrap' >> ${group}.sh - echo ' if [[ $location == "1" ]]; then' >> ${group}.sh - echo ' if [[ $genetreepart == "yes" ]]; then' >> ${group}.sh - echo ' raxmlHPC-PTHREADS -T $TORQUE_RESC_TOTAL_PROCS -f a -s $file.phylip -q $filepart.part -n $file.result -m GTRCAT -p 1234 -x 1234 -N 100' >> ${group}.sh + echo ' #1.Check if there are completely undetermined columns in alignment (RAxML -y will produced .reduced alignment and partition files)' >> ${group}.sh + echo ' # Compute parsimony tree only and produce reduced alignment and appropriate reduced partition file' >> ${group}.sh + echo ' if [[ $genetreepart == "no" ]]; then' >> ${group}.sh + echo ' $raxmlseq -y -m GTRCAT -p 12345 -s $file.fas -n $file.check >> raxml.log' >> ${group}.sh + echo ' else' >> ${group}.sh + echo ' $raxmlseq -y -m GTRCAT -p 12345 -s $file.fas -q $filepart.part -n $file.check >> raxml.log' >> ${group}.sh + echo ' fi' >> ${group}.sh + echo ' #2.Test if reduced files were produced' >> ${group}.sh + echo ' if [ -f $file.fas.reduced ]; then' >> ${group}.sh + echo ' echo "Reduced alignment found...using it" >> raxml.log' >> ${group}.sh + echo ' if [[ $genetreepart == "no" ]]; then' >> ${group}.sh + echo ' rm $file.fas' >> ${group}.sh + echo ' mv $file.fas.reduced $file.fas' >> ${group}.sh echo ' else' >> ${group}.sh + echo ' rm $filepart.part' >> ${group}.sh + echo ' mv $filepart.part.reduced $filepart.part' >> ${group}.sh + echo ' rm $file.fas' >> ${group}.sh + echo ' mv $file.fas.reduced $file.fas' >> ${group}.sh + echo ' fi' >> ${group}.sh + echo ' else' >> ${group}.sh + echo ' echo "Reduced alignment not found...using original alignment" >> raxml.log' >> ${group}.sh + echo ' fi' >> ${group}.sh + echo ' #3.Run RAxML' >> ${group}.sh + echo ' if [[ $location == "1" ]]; then' >> ${group}.sh + echo ' if [[ $genetreepart == "no" ]]; then' >> ${group}.sh echo ' raxmlHPC-PTHREADS -T $TORQUE_RESC_TOTAL_PROCS -f a -s $file.phylip -n $file.result -m GTRCAT -p 1234 -x 1234 -N 100' >> ${group}.sh + echo ' else' >> ${group}.sh + echo ' raxmlHPC-PTHREADS -T $TORQUE_RESC_TOTAL_PROCS -f a -s $file.phylip -q $filepart.part -n $file.result -m GTRCAT -p 1234 -x 1234 -N 100' >> ${group}.sh echo ' fi' >> ${group}.sh echo ' elif [[ $location == "2" ]]; then' >> ${group}.sh - echo ' if [[ $genetreepart == "yes" ]]; then' >> ${group}.sh - echo ' $raxmlpthreads -T $NSLOTS -f a -s $file.phylip -q $filepart.part -n $file.result -m GTRCAT -p 1234 -x 1234 -N 100' >> ${group}.sh - echo ' else' >> ${group}.sh + echo ' if [[ $genetreepart == "no" ]]; then' >> ${group}.sh echo ' $raxmlpthreads -T $NSLOTS -f a -s $file.phylip -n $file.result -m GTRCAT -p 1234 -x 1234 -N 100' >> ${group}.sh + echo ' else' >> ${group}.sh + echo ' $raxmlpthreads -T $NSLOTS -f a -s $file.phylip -q $filepart.part -n $file.result -m GTRCAT -p 1234 -x 1234 -N 100' >> ${group}.sh echo ' fi' >> ${group}.sh echo ' fi' >> ${group}.sh - echo ' cp *$file.result '"$path"'/$type/72trees'"${MISSINGPERCENT}_${SPECIESPRESENCE}"'/RAxML' >> ${group}.sh + echo ' cp *$file.result '"${path}/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}"'/RAxML' >> ${group}.sh echo 'done' >> ${group}.sh echo '#Clean scratch/work directory' >> ${group}.sh echo 'if [[ $PBS_O_HOST == *".cz" ]]; then' >> ${group}.sh @@ -212,10 +269,10 @@ if [[ $location == "1" || $location == "2" ]]; then chmod +x ${group}.sh if [[ $location == "1" ]]; then - cp ${group}.sh $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML + cp ${group}.sh $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML qsub ${group}.sh else - cp ${group}.sh $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML + cp ${group}.sh $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML cp ${group}.sh .. echo 'qsub '"${group}"'.sh' >> ../submitRAxMLjobs.sh fi @@ -227,8 +284,13 @@ else echo -e "Modifying selected FASTA files...\n" for i in $(cat selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt) do - cp $path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}/${i}_modif${MISSINGPERCENT}.fas . - cp $path/$type/70concatenated_exon_alignments/${i}.part . + cp $path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}/${i}_modif${MISSINGPERCENT}.fas . + if [[ $genetreepart == "exon" ]]; then + cp $path/${alnpath}/${i}.part . + elif [[ $genetreepart == "codon" ]]; then + cp $path/${alnpath}/${i}.codonpart.file . + mv ${i}.codonpart.file ${i}.part + fi #Substitute '(' by '_' and ')' by nothing ('(' and ')' not allowed in RAxML) sed -i.bak 's/(/_/g' ${i}_modif${MISSINGPERCENT}.fas sed -i.bak 's/)//g' ${i}_modif${MISSINGPERCENT}.fas @@ -248,15 +310,38 @@ else #modify name for partition file (remove '_modif${MISSINGPERCENT}' filepart=$(sed "s/_modif${MISSINGPERCENT}//" <<< $file) #run RAxML + #1.Check if there are completely undetermined columns in alignment (RAxML -y will produced .reduced alignment and partition files) + # Compute parsimony tree only and produce reduced alignment and appropriate reduced partition file + if [[ $genetreepart == "no" ]]; then + $raxmlseq -y -m GTRCAT -p 12345 -s $file.fas -n $file.check >> raxml.log + else + $raxmlseq -y -m GTRCAT -p 12345 -s $file.fas -q $filepart.part -n $file.check >> raxml.log + fi + #2.Test if reduced files were produced + if [ -f $file.fas.reduced ]; then + echo "Reduced alignment found...using it" >> raxml.log + if [[ $genetreepart == "no" ]]; then + rm $file.fas + mv $file.fas.reduced $file.fas + else + rm $filepart.part + mv $filepart.part.reduced $filepart.part + rm $file.fas + mv $file.fas.reduced $file.fas + fi + else + echo "Reduced alignment not found...using original alignment" >> raxml.log + fi + #3.Run RAxML if [[ $genetreepart == "yes" ]]; then - $raxmlpthreads -T $numbcores -f a -s $file.fas -q $filepart.part -n $file.result -m GTRCAT -p 1234 -x 1234 -N 100 >> raxml.log + $raxmlpthreads -T $numbcores -f a -s $file.fas -n $file.result -m GTRCAT -p 1234 -x 1234 -N 100 >> raxml.log else - $raxmlpthreads -T $numbcores -f a -s $file.fas -q -n $file.result -m GTRCAT -p 1234 -x 1234 -N 100 >> raxml.log + $raxmlpthreads -T $numbcores -f a -s $file.fas -q $filepart.part -n $file.result -m GTRCAT -p 1234 -x 1234 -N 100 >> raxml.log fi - cp *$file.result $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML + cp *${file}.result $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML done #Copy raxml.log to home -cp raxml.log $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML +cp raxml.log $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/RAxML fi #Clean scratch/work directory @@ -275,5 +360,6 @@ if [[ $location == "2" ]]; then echo -e "\nGo to homedir and run submitRAxMLjobs.sh..." elif [[ $location == "1" ]]; then echo -e "\nAfter all jobs finish run script HybPhyloMaker6a2 in order to calculate tree properties..." +elif [[ $location == "0" ]]; then + echo -e "\nRun script HybPhyloMaker6a2 in order to calculate tree properties..." fi -echo -e "\nRun script HybPhyloMaker6a2 in order to calculate tree properties..." diff --git a/HybPhyloMaker6b_FastTree_for_selected.sh b/HybPhyloMaker6b_FastTree_for_selected.sh index cac1c9c..cd1d039 100644 --- a/HybPhyloMaker6b_FastTree_for_selected.sh +++ b/HybPhyloMaker6b_FastTree_for_selected.sh @@ -1,7 +1,7 @@ #!/bin/bash #----------------MetaCentrum---------------- #PBS -l walltime=1d -#PBS -l nodes=1:ppn=4 +#PBS -l nodes=1:ppn=4:minspec=29 #PBS -j oe #PBS -l mem=16gb #PBS -l scratch=4gb @@ -21,8 +21,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 06b - FastTree gene tree building * -# * v.1.3.1 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** @@ -86,31 +86,42 @@ else type="exons" fi +#Settings for (un)corrected reading frame +if [[ $corrected =~ "yes" ]]; then + alnpath=$type/80concatenated_exon_alignments_corrected + alnpathselected=$type/81selected_corrected + treepath=$type/82trees_corrected +else + alnpath=$type/70concatenated_exon_alignments + alnpathselected=$type/71selected + treepath=$type/72trees +fi + #Check necessary file echo -ne "Testing if input data are available..." -if [ -f "$path/$type/71selected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt" ]; then - if [ -d "$path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}" ]; then - if [ "$(ls -A $path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT})" ]; then +if [ -f "$path/${alnpathselected}${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt" ]; then + if [ -d "$path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}" ]; then + if [ "$(ls -A $path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT})" ]; then echo -e "OK\n" else - echo -e "'$path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is empty. Exiting...\n" + echo -e "'$path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is empty. Exiting...\n" rm -d ../workdir06b/ 2>/dev/null exit 3 fi else - echo -e "'$path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is missing. Exiting...\n" + echo -e "'$path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}' is missing. Exiting...\n" rm -d ../workdir06b/ 2>/dev/null exit 3 fi else - echo -e "'$path/$type/71selected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt' is missing. Exiting...\n" + echo -e "'$path/${alnpathselected}${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt' is missing. Exiting...\n" rm -d ../workdir06b/ 2>/dev/null exit 3 fi #Test if folder for results exits -if [ -d "$path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree" ]; then - echo -e "Directory '$path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree' already exists. Delete it or rename before running this script again. Exiting...\n" +if [ -d "$path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree" ]; then + echo -e "Directory '$path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree' already exists. Delete it or rename before running this script again. Exiting...\n" rm -d ../workdir06b/ 2>/dev/null exit 3 else @@ -129,12 +140,12 @@ cp $source/catfasta2phyml.pl . cp $source/AMAS.py . cp $source/CompareToBootstrap.pl . cp $source/MOTree.pm . -cp $path/$type/71selected${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt . +cp $path/${alnpathselected}${MISSINGPERCENT}/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt . # Copy and modify selected FASTA files echo -en "\nModifying selected FASTA files..." for i in $(cat selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt) do - cp $path/$type/71selected${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}/${i}_modif${MISSINGPERCENT}.fas . + cp $path/${alnpathselected}${MISSINGPERCENT}/deleted_above${MISSINGPERCENT}/${i}_modif${MISSINGPERCENT}.fas . #Substitute '(' by '_' and ')' by nothing ('(' and ')' not allowed in FastTree) sed -i.bak 's/(/_/g' ${i}_modif${MISSINGPERCENT}.fas sed -i.bak 's/)//g' ${i}_modif${MISSINGPERCENT}.fas @@ -145,8 +156,8 @@ done #Make a list of all fasta files ls *.fas | cut -d"." -f1 > FileForFastTree.txt #Make dir for results -mkdir -p $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE} -mkdir -p $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree +mkdir -p $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE} +mkdir -p $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree echo -e "done\n" #----------------Generate gene trees using FastTree---------------- @@ -165,10 +176,10 @@ for file in $(cat FileForFastTree.txt); do else $fasttreebin -nt ${file}.fas > ${file}.fast.tre 2>>FastTree.log fi - cp *$file.fast.tre $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp *$file.fast.tre $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree done #Copy log to home -cp FastTree.log $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree +cp FastTree.log $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree #----------------Generate bootstrapped gene trees using FastTree---------------- #Bootstrap using FastTree @@ -229,11 +240,11 @@ if [[ $FastTreeBoot =~ "yes" ]]; then #Map bootstrap support values onto the original tree perl ./CompareToBootstrap.pl -tree *${file}.fast.tre -boot ${file}.boot.fast.trees > ${file}.boot.fast.tre #Copy bootstrap trees and final tree with bootstrap values to home - cp ${file}.boot.fast.trees $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree - cp ${file}.boot.fast.tre $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp ${file}.boot.fast.trees $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp ${file}.boot.fast.tre $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree done #Copy logs to home - cp *.log $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp *.log $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree fi #----------------Make a summary table with statistical properties for trees using R---------------- @@ -266,9 +277,9 @@ done for i in $(cat LBscores.csv | sed 1d | cut -d"," -f5 | sort | uniq); do grep $i LBscores.csv | awk -F',' -v val=$i '{ if ($5 == val) result=$1 } END { print val "\t" result }' >> LBscoresSDPerLocus.txt done -cp LBscores.csv $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree -cp LBscoresPerTaxon.txt $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree -cp LBscoresSDPerLocus.txt $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree +cp LBscores.csv $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree +cp LBscoresPerTaxon.txt $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree +cp LBscoresSDPerLocus.txt $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree #Combine 'LBscoresSDPerLocus.txt' with 'tree_stats_table.csv' awk '{ print $2 }' LBscoresSDPerLocus.txt > tmp && mv tmp LBscoresSDPerLocus.txt paste tree_stats_table.csv LBscoresSDPerLocus.txt | tr "\t" "," > tmp && mv tmp tree_stats_table.csv @@ -285,8 +296,8 @@ else fi #Copy results to home -cp tree_stats_table.csv $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree -cp *.png $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree +cp tree_stats_table.csv $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree +cp *.png $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree #Remove results rm *.png rm LB*.txt LB*.csv @@ -318,9 +329,9 @@ if [[ $FastTreeBoot =~ "yes" ]]; then mv LBscores.csv LBscores_bootstrap.csv mv LBscoresPerTaxon.txt LBscoresPerTaxon_bootstrap.txt mv LBscoresSDPerLocus.txt LBscoresSDPerLocus_bootstrap.txt - cp LBscores_bootstrap.csv $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree - cp LBscoresPerTaxon_bootstrap.txt $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree - cp LBscoresSDPerLocus_bootstrap.txt $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp LBscores_bootstrap.csv $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp LBscoresPerTaxon_bootstrap.txt $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp LBscoresSDPerLocus_bootstrap.txt $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree #Combine 'LBscoresSDPerLocus_bootstrap.txt' with 'tree_stats_table.csv' awk '{ print $2 }' LBscoresSDPerLocus_bootstrap.txt > tmp && mv tmp LBscoresSDPerLocus_bootstrap.txt paste tree_stats_table.csv LBscoresSDPerLocus_bootstrap.txt | tr "\t" "," > tmp && mv tmp tree_stats_table.csv @@ -337,10 +348,10 @@ if [[ $FastTreeBoot =~ "yes" ]]; then fi #Rename results and copy results to home mv tree_stats_table.csv tree_stats_table_bootstrap.csv - cp tree_stats_table_bootstrap.csv $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp tree_stats_table_bootstrap.csv $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree #Rename all PNG files generated by R (add 'bootstrap')and copy them to home for file in *.png; do mv "$file" "${file/.png/_bootstrap.png}"; done - cp *.png $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp *.png $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree fi #----------------Combine tree summary table with alignment summary and print comparison plots---------------- @@ -348,7 +359,7 @@ fi cp $source/plotting_correlations.R . echo -e "Combining alignment and tree properties...\n" #Copy alignment summary -cp $path/$type/71selected${MISSINGPERCENT}/summarySELECTED_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt . +cp $path/${alnpathselected}${MISSINGPERCENT}/summarySELECTED_${MISSINGPERCENT}_${SPECIESPRESENCE}.txt . #***Modify alignment summary*** #Remove '_Assembly' (now locus names start with a number) sed -i.bak 's/Assembly_//g' summarySELECTED*.txt @@ -405,10 +416,10 @@ if [[ $location == "1" ]]; then else R --slave -f plotting_correlations.R >> R.log 2>&1 fi -cp genes_corrs.* $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree +cp genes_corrs.* $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree rm genes_corrs.* mv combined.txt gene_properties.txt -cp gene_properties.txt $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree +cp gene_properties.txt $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree #Run comparison plots for FastTrees with true bootstrap ('bootstrap') if [[ $FastTreeBoot =~ "yes" ]]; then @@ -424,14 +435,14 @@ if [[ $FastTreeBoot =~ "yes" ]]; then fi mv genes_corrs.png genes_corrs_bootstrap.png mv genes_corrs.pdf genes_corrs_bootstrap.pdf - cp genes_corrs_bootstrap.* $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp genes_corrs_bootstrap.* $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree rm genes_corrs_bootstrap.* mv combined.txt gene_properties_bootstrap.txt - cp gene_properties_bootstrap.txt $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree + cp gene_properties_bootstrap.txt $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree fi #Copy R.log to home -cp R.log $path/$type/72trees${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree +cp R.log $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/FastTree #Clean scratch/work directory if [[ $PBS_O_HOST == *".cz" ]]; then diff --git a/HybPhyloMaker7_roottrees.sh b/HybPhyloMaker7_roottrees.sh index d871b99..f032912 100644 --- a/HybPhyloMaker7_roottrees.sh +++ b/HybPhyloMaker7_roottrees.sh @@ -19,8 +19,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 07 - Root gene trees * -# * v.1.3.2 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** diff --git a/HybPhyloMaker8a_astral.sh b/HybPhyloMaker8a_astral.sh index 88a492f..f3d72ae 100644 --- a/HybPhyloMaker8a_astral.sh +++ b/HybPhyloMaker8a_astral.sh @@ -1,7 +1,7 @@ #!/bin/bash #----------------MetaCentrum---------------- #PBS -l walltime=2h -#PBS -l nodes=1:ppn=4 +#PBS -l nodes=1:ppn=4:debian8:minspec=29 #PBS -j oe #PBS -l mem=16gb #PBS -l scratch=2gb @@ -21,8 +21,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 08a - Astral species tree * -# * v.1.3.1 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** @@ -52,6 +52,7 @@ if [[ $PBS_O_HOST == *".cz" ]]; then module add python27-modules-gcc module add python-3.4.1-intel module add newick-utils-1.6 + module add p4 elif [[ $HOSTNAME == compute-*-*.local ]]; then echo -e "\nHybPhyloMaker8a is running on Hydra..." #settings for Hydra @@ -66,6 +67,7 @@ elif [[ $HOSTNAME == compute-*-*.local ]]; then module load java/1.7 module load bioinformatics/anaconda3/2.3.0 module load bioinformatics/newickutilities/0.0 + module load bioinformatics/p4/ #??? else echo -e "\nHybPhyloMaker8a is running locally..." #settings for local run diff --git a/HybPhyloMaker8b_astrid.sh b/HybPhyloMaker8b_astrid.sh index b4fa887..dac43bb 100644 --- a/HybPhyloMaker8b_astrid.sh +++ b/HybPhyloMaker8b_astrid.sh @@ -1,7 +1,7 @@ #!/bin/bash #----------------MetaCentrum---------------- #PBS -l walltime=2h -#PBS -l nodes=1:ppn=4 +#PBS -l nodes=1:ppn=4:debian8 #PBS -j oe #PBS -l mem=8gb #PBS -N HybPhyloMaker8b_Astrid @@ -20,8 +20,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 08b - Astrid species tree * -# * v.1.3.2 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** @@ -53,6 +53,7 @@ if [[ $PBS_O_HOST == *".cz" ]]; then module add python27-modules-gcc module add python-3.4.1-intel module add newick-utils-1.6 + module add p4 elif [[ $HOSTNAME == *local* ]]; then echo -e "\nHybPhyloMaker8b is running on Hydra..." #settings for Hydra @@ -66,6 +67,7 @@ elif [[ $HOSTNAME == *local* ]]; then #Add necessary modules module load bioinformatics/anaconda3/2.3.0 module load bioinformatics/newickutilities/0.0 + module load bioinformatics/p4/ #??? else echo -e "\nHybPhyloMaker8b is running locally..." #settings for local run diff --git a/HybPhyloMaker8c_mrl.sh b/HybPhyloMaker8c_mrl.sh index e7f4ff8..acce073 100644 --- a/HybPhyloMaker8c_mrl.sh +++ b/HybPhyloMaker8c_mrl.sh @@ -21,8 +21,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 08c - MRL species tree * -# * v.1.3.1 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** diff --git a/HybPhyloMaker8e_concatenatedFastTree.sh b/HybPhyloMaker8e_concatenatedFastTree.sh index e21b749..74a086c 100644 --- a/HybPhyloMaker8e_concatenatedFastTree.sh +++ b/HybPhyloMaker8e_concatenatedFastTree.sh @@ -1,7 +1,7 @@ #!/bin/bash #----------------MetaCentrum---------------- #PBS -l walltime=2d -#PBS -l nodes=1:ppn=8 +#PBS -l nodes=1:ppn=8:minspec=29 #PBS -j oe #PBS -l mem=24gb #PBS -l scratch=8gb @@ -21,8 +21,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 08e - concatenated species tree * -# * v.1.3.1 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** diff --git a/HybPhyloMaker8f_concatenatedExaML.sh b/HybPhyloMaker8f_concatenatedExaML.sh index b09891f..e3dcb6f 100644 --- a/HybPhyloMaker8f_concatenatedExaML.sh +++ b/HybPhyloMaker8f_concatenatedExaML.sh @@ -1,17 +1,17 @@ #!/bin/bash #PBS -l walltime=2d -#PBS -l nodes=4:ppn=16:infiniband +#PBS -l nodes=1:ppn=12:minspec=29 #PBS -j oe #PBS -l mem=4gb -#PBS -l scratch=32gb:shared +#PBS -l scratch=32gb #PBS -N HybPhyloMaker8f_ExaMLconcatenated_tree #PBS -m abe # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 08f - ExaML concatenated tree * -# * v.1.3.2 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** @@ -222,7 +222,7 @@ else module add hdf5-1.8.12-gcc # Run PartitionFinder #python partitionfinder/PartitionFinder.py PartitionFinder --raxml --rcluster-max 1000 --rcluster-percent 10 - python partitionfinder/PartitionFinder.py PartitionFinder --raxml + python partitionfinder/PartitionFinder.py -p $TORQUE_RESC_TOTAL_PROCS PartitionFinder --raxml #Prepare RAxML (ExaML) partition file from PartitionFinder results grep "DNA, Subset" PartitionFinder/analysis/best_scheme.txt > RAxMLpartitions.txt echo -e "\nPartitionFinder finished...\n" diff --git a/HybPhyloMaker9_update_trees.sh b/HybPhyloMaker9_update_trees.sh index b8cafc1..c46ee5a 100644 --- a/HybPhyloMaker9_update_trees.sh +++ b/HybPhyloMaker9_update_trees.sh @@ -19,8 +19,8 @@ # ******************************************************************************** # * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building * # * Script 09 - Update trees * -# * v.1.3.2 * -# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2016 * +# * v.1.4.0 * +# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2017 * # * tomas.fer@natur.cuni.cz * # ******************************************************************************** @@ -166,7 +166,7 @@ cp *histogram.png $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/${tree}/ #Copy R.log to home cp R.log $path/${treepath}${MISSINGPERCENT}_${SPECIESPRESENCE}/${tree}/update/ #Prepare list of genes of updated selection -cat gene_properties_update.txt | sed 1d | cut -f1 | sort | sed 's/Corrected/CorrectedAssembly_/g' | sed "s/_modif${MISSINGPERCENT}.fas//g" > selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}_update.txt +cat gene_properties_update.txt | sed 1d | cut -f1 | sort | sed 's/Corrected//g' | sed "s/_modif${MISSINGPERCENT}.fas//g" > selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}_update.txt mkdir -p $path/${alnpathselected}${MISSINGPERCENT}/updatedSelectedGenes cp selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}_update.txt $path/${alnpathselected}${MISSINGPERCENT}/updatedSelectedGenes echo -e "\nList of updated selected genes saved to $path/${alnpathselected}${MISSINGPERCENT}/updatedSelectedGenes/selected_genes_${MISSINGPERCENT}_${SPECIESPRESENCE}_update.txt..." diff --git a/install_software.sh b/install_software.sh index a40ce8a..20abce1 100644 --- a/install_software.sh +++ b/install_software.sh @@ -5,10 +5,10 @@ # Without changes works only on 64-bit platforms (x86_64) # # This script MUST be run with root privileges! # # # -# Tomas Fer, 2016 # +# Tomas Fer, 2017 # # tomas.fer@natur.cuni.cz # # https://github.com/tomas-fer/HybPhyloMaker # -# v.1.3.2 # +# v.1.4.0 # ########################################################################################################################## #Carefully set your distribution @@ -59,7 +59,7 @@ if ! [ -x "$(command -v python)" ]; then $installer install -y python &> python_install.log fi -#Python3 +#Python3 (+ Biopython) if ! [ -x "$(command -v python3)" ]; then if [[ $distribution =~ "CentOS" ]]; then echo -e "Installing 'python3'" @@ -68,9 +68,33 @@ if ! [ -x "$(command -v python3)" ]; then else echo -e "Installing 'python3'" $installer install -y python3 &> python3_install.log #Does not work on CentOS + fi fi +#Pip +if ! [ -x "$(command -v pip)" ]; then + echo -e "Installing 'pip'" + $installer install -y python-pip &> pip_install.log +fi + +#Pip3 (also required for 'kindel' installation, see below) +if ! [ -x "$(command -v pip3)" ]; then + if [[ $distribution =~ "CentOS" ]]; then + echo -e "Installing 'pip3'" + $installer install -y python34-devel &>> python3_install.log #Only for CentOS + $installer install -y python34-pip &> pip3_install.log #Only for CentOS + else + echo -e "Installing 'pip3'" + $installer install -y python3-dev &>> python3_install.log #Does not work on CentOS + $installer install -y python3-pip &> pip3_install.log #Does not work on CentOS + fi +fi + +#Biopython +echo -e "Installing 'biopython'" +pip3 install biopython &> biopython_install.log + #Java if [[ $distribution =~ "Debian" ]]; then echo -e "Installing 'java'" @@ -104,15 +128,9 @@ elif [[ $distribution =~ "Fedora" ]] || [[ $distribution =~ "CentOS" ]]; then fi #R -#Comment for Ubuntu: you should install the newest version of R by adding CRAN mirror to /etc/apt/sources.list -#(see, e.g., https://cran.r-project.org/bin/linux/ubuntu/README.html) -# codename=$(lsb_release -c -s) -# echo "deb http://cran.cnr.berkeley.edu/bin/linux/ubuntu $codename/" | sudo tee -a /etc/apt/sources.list > /dev/null -# apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9 -# add-apt-repository ppa:marutter/rdev -# apt-get update -# apt-get upgrade -# apt-get install -y r-base r-base-dev +#Comment for Ubuntu/Debian: you should install the newest version of R by adding CRAN mirror to /etc/apt/sources.list +#Look at the end of this script for an advice how to do that... +#Newer version (i.e., at least v3.2) should be installed before running this script! if [[ $distribution =~ "Debian" ]]; then echo -e "Installing 'R'" $installer install -y r-base-dev &> R_install.log #Debian/Ubuntu @@ -221,6 +239,20 @@ if ! [ -x "$(command -v FastTree)" ]; then cd .. fi +#EMBOSS +if ! [ -x "$(command -v transeq)" ]; then + echo -e "Installing 'EMBOSS'" + wget ftp://emboss.open-bio.org/pub/EMBOSS/emboss-latest.tar.gz &> ../EMBOSS_install.log + tar xfz emboss-latest.tar.gz 1>/dev/null + rm emboss-latest.tar.gz + cd EMBOSS* + ./configure &>> ../EMBOSS_install.log + make &>> ../EMBOSS_install.log + ldconfig + make install &>> ../EMBOSS_install.log + cd .. +fi + #RAxML git clone https://github.com/stamatak/standard-RAxML &> raxml_install.log cd standard-RAxML @@ -354,6 +386,12 @@ if ! [ -x "$(command -v ococo)" ]; then cd .. fi +#kindel (necessary for majority rule consensus building from mapped reads in BAM file) +#see https://pypi.python.org/pypi/kindel +if ! [ -x "$(command -v kindel)" ]; then + pip3 install kindel &>> kindel_install.log +fi + #p4 (only necessary for combining bootstrap support in Astral and Astrid trees) #see http://p4.nhm.ac.uk/installation.html #For compilation on Fedora/CentOS/OpenSUSE you need to specify where 'gsl' is installed (in setup.py) - modification of 'setup.py' is included below @@ -381,14 +419,16 @@ if ! [ -x "$(command -v p4)" ]; then if [[ $distribution =~ "Fedora" ]] || [[ $distribution =~ "CentOS" ]]; then $installer install redhat-rpm-config &> rpm-config_install.log fi + #install python module 'future' + pip install future &> python-future_install.log git clone https://github.com/pgfoster/p4-phylogenetics &> p4_install.log cd p4-phylogenetics #Modify setup.py to be able to find gsl if [[ $distribution =~ "OpenSUSE" ]] || [[ $distribution =~ "CentOS" ]] || [[ $distribution =~ "Fedora" ]]; then replace="my_include_dirs = [\'\/usr\/include\/\']" - sed -i.bak "45s/.*/$replace/" setup.py - sed -i.bak2 "46s/# //" setup.py + sed -i.bak "46s/.*/$replace/" setup.py + sed -i.bak2 "47s/# //" setup.py fi python setup.py build &>> ../p4_install.log python setup.py install &>> ../p4_install.log @@ -402,7 +442,7 @@ cd .. #Check if everything is installed correctly echo -e "\n**************************************************************" echo -e "Software installed...checking for binaries in PATH" -for i in parallel bowtie2 ococo samtools bam2fastq java fastuniq perl blat mafft python python3 trimal mstatx FastTree nw_reroot nw_topology raxmlHPC raxmlHPC-PTHREADS R p4; do +for i in parallel bowtie2 ococo kindel samtools transeq bam2fastq java fastuniq perl blat mafft python python3 trimal mstatx FastTree nw_reroot nw_topology raxmlHPC raxmlHPC-PTHREADS R p4; do command -v $i >/dev/null 2>&1 || { echo -n $i; echo >&2 "...not found"; } done @@ -433,7 +473,7 @@ echo -e "\nInstalation script finished.\n" # Successfully tested on # - Ubuntu 14.04 LTS #Newer R version necessary, see below!!! -# - Debian 8.6 +# - Debian 8.6 #Newer R version necessary, see below!!! # - OpenSUSE 42 # - CentOS 7.2 # - Fedora 24 @@ -465,14 +505,13 @@ echo -e "\nInstalation script finished.\n" #NewickUtilities will not work! #********************************************************************************** -#Debian 7 +#Debian 7, 8 #too old R version (see https://cran.r-project.org/bin/linux/debian/ how to update) # codename=$(lsb_release -c -s) # echo "deb http://cran.cnr.berkeley.edu/bin/linux/debian ${codename}-cran3/" | sudo tee -a /etc/apt/sources.list > /dev/null # apt-key adv --keyserver keys.gnupg.net --recv-key 6212B7B7931C4BB16280BA1306F90DE5381BA480 -# apt-get install -y r-base r-base-dev # apt-get update # apt-get upgrade # -#and too old glibc. GLIBC_2.14 is required - and it is not easy to upgrade, better to upgrade to Debian 8 +#only Debian 7 - too old glibc. GLIBC_2.14 is required - and it is not easy to upgrade, better to upgrade to Debian 8 #NewickUtilities will not work! diff --git a/settings.cfg b/settings.cfg index 0f0fae0..704c337 100644 --- a/settings.cfg +++ b/settings.cfg @@ -1,8 +1,8 @@ #**************************************** #* Configuration file for HybPhyloMaker * -#* Tomas Fer, 2016 * +#* Tomas Fer, 2017 * #* tomas.fer@natur.cuni.cz * -#* v.1.3.2 * +#* v.1.4.0 * #**************************************** #-------------------------------------------------------------------------------------------------------------------- @@ -26,8 +26,9 @@ tree=FastTree #Bootstrap FastTree (yes/no). no=normal FastTree with local branch support, very fast; yes=FastTree is applied to each of 100 bootstrap replicates, slow FastTreeBoot=no -#Use 'by exon partitioning' when building gene trees with RAxML (yes/no) -genetreepart=yes +#Use 'no', 'by exon' or 'by codon and exon' partitioning when building gene trees with RAxML (no, exon, codon) +#Codon is only for the data with corrected reading frame! +genetreepart=exon #Outgroup (as it appears in RAxML files) OUTGROUP="Siphonochilus-aethiopicus_S130" @@ -56,6 +57,12 @@ cp=no #Working with updated list of genes (yes/no) update=no +#Working with corrected reading frame for exons/genes (yes/no) - not yet fully implemented +corrected=yes + +#Maximum number of stop codons allowed per alignment (i.e., considered as errors) +maxstop=5 + #-------------------------------------------------------------------------------------------------------------------- # **** REFERENCE FILES **** #-------------------------------------------------------------------------------------------------------------------- @@ -128,10 +135,13 @@ raxmlperjob=20 #whether mapping using bowtie2 should be done (yes/no) mapping=yes +#consensus calling software (kindel/ococo) +conscall=kindel + #minimum site coverage for SNP calling (N will be in consensus for sites with lower coverage) mincov=2 -#majority threshold for consensus calling (0-1) +#majority threshold for consensus calling (0-1) - not working in OCOCO! majthres=0.51 #--------------------------------------------------------------------------------------------------------------------