diff --git a/dataAnalysis/ComparativeGenomics/OrthofinderSynteny_Update.md b/dataAnalysis/ComparativeGenomics/OrthofinderSynteny_Update.md index 9a6f286..91e64d0 100644 --- a/dataAnalysis/ComparativeGenomics/OrthofinderSynteny_Update.md +++ b/dataAnalysis/ComparativeGenomics/OrthofinderSynteny_Update.md @@ -7,6 +7,9 @@ header: --- # Finding orthology and synteny with orthofinder, iadhore, and circos. + +Here we will be using Orthofinder to identify orthologous genes across three species of Abalone. Orthofinder output will be used as input for i-adhore to identify long-term ancestral synteny using homeologous gene order. This information will then be converted into the format needed to create a Circos plot displaying synteny between two species. + Orthofinder is a robust software that can find orthologs very quickly and with very little input. We are going to take the output from orthofinder and use it as input for iAdhore. I adhore is a program that uses homeologous genes to identify longer-term ancestral synteny (in comparison to direct chromosomal alignments (Mummer, Minimap2)). With some modifications, the output from iadhore can be used to draw synteny plots with circos, or just to identify genomic feature overlap with Bedtools by creating a synteny gff. @@ -87,7 +90,7 @@ awk '$3=="mRNA"' WhiteAbGeneAnnots.gff3 |sed 's/;/\t/g' |awk 'substr($9,length($ ``` -### Run orthofinder on protein sets +# Run orthofinder on protein sets Orthofinder is pretty amazing. All that is needed is a new folder that has just your protein fastas. There are lots of options to play with, but I just used default for simplicity. (-M dendroblast, -S BLAST, -T fasttree, -I 1.5). 
``` @@ -167,7 +170,7 @@ Because in this comparison I have 3 different species, I need to make a subset o /work/gif3/masonbrink/USDA/01_OrthofinderSynteny/03_iadhore/01_Black_Green grep -e "Black" -e "Green" ../Orthologs.list >BlackGreenOrthologues.list - +# i-adhore Black vs Green ### Create lists of files named by scaffold name The next step is to create files containing lists of genes with strand orientation. BEWARE, the genes/proteins must have the EXACT same names as the genes in your "Orthologs.list" file. Iadhore will throw errors if your two species have scaffolds with the exact same names. You will also run into errors if you did not eliminate all non-primary protein isoforms from your Orthofinder run. ``` @@ -327,6 +330,8 @@ Green_100 subject/Green_100.lst Green_1011 subject/Green_1011.lst etc. +NOTE THAT YOU NEED TO REMOVE ALL COMMENTS FROM iadhore.ini, IF YOU ARE TO USE IT. + #Our ortholog list made above blast_table=BlackGreenOrthologues.list #This is the type of orthologs we are providing, and is typically the easiest to provide. @@ -361,51 +366,255 @@ If you get a gene missing from blast table error, most likely you either have a If iadhore is successful, you will generate an output folder with some informative files like "multiplicons.txt" and "segments.txt" ``` -### Time to put it into circos! -Circos is just a good visualization tool for synteny, which I find to be more informative than dot plots. Circos can go much further than dot plots by allowing other features to be plotted alongside the synteny, for example: gene density, repeat density, etc) +### Time to put it into circos! Green vs Black +Circos is just a good visualization tool for synteny, which I find to be more informative than dot plots. Circos can go much further than dot plots by allowing other features to be plotted alongside the synteny, for example: gene density, repeat density, snp density, etc) ``` -#I create a new folder above the 02_iadhore folder. 
Again, some of my commands depend on this. -#/work/GIF/remkv6/USDA/15_OrthoFinderSynteny/03_circos - +/work/gif3/masonbrink/USDA/01_OrthofinderSynteny/05_Circos/01_BlackGreen #Softlink all relevant files:(genome, GFF, segments.txt) -ln -s ../GCF_000005845.2_ASM584v2_genomic.fna -ln -s ../ GCF_000249815.1_ASM24981v2_genomic.fna -ln -s ../02_iadhore/output/segments.txt -ln -s ../ModGCF_000249815.1_ASM24981v2_genomic.gff -ln -s ../GCF_000005845.2_ASM584v2_genomic.gff +ln -s ../../03_iadhore/01_Black_Green/output/segments.txt +ln -s ../../01_GenomicResources/PrimaryGreenAbGeneAnnots.gff3 +ln -s ../../01_GenomicResources/PrimaryBlackAbGeneAnnots.gff3 +ln -s ../../01_GenomicResources/RenamedBlackAbGenome.fasta +ln -s ../../01_GenomicResources/RenamedGreenAbGenome.fasta + + +# The five scripts below will just work if you change 3 things. 1. change "GreenAbalone" to whatever you named your second genome in your iadhore.ini file. 2. change PrimaryGreenAbGeneAnnots.gff3 to the gff that is associated with the second genome name in your iadhore.ini file. 3. change the PrimaryBlackAbGeneAnnots.gff3 to the gff you created that is associated with the first genome name in your iadhore.ini file. +Essentially what is happening below is that you are swapping columns in segments.txt until the Black entries are all on one side. 
Then I extract the 5' position for the 5' syntenic gene and the 3' position for the 3' syntenic gene for each genome + +less segments.txt |awk 'NR>1{print $2,$5,$6}' |awk '{if(NR%2) {print "#"$1,$2,$3}else {print $1,$2,$3}}' |tr "\n" "\t" |sed 's/\t#/\n/g' |awk '{print $2,$3,$5,$6}' |awk '{if(substr($1,1,5)=="Black") {print $1,$2,$3,$4} else{print $3,$4,$1,$2}}' |awk 'substr($1,1,5)!=substr($3,1,5) {print $1}' |xargs -I xx awk '$9=="'xx'"' PrimaryBlackAbGeneAnnots.gff3|awk '{if($7=="+") {print $1,$4} else {print $1,$5}}' >Col1.list +less segments.txt |awk 'NR>1{print $2,$5,$6}' |awk '{if(NR%2) {print "#"$1,$2,$3}else {print $1,$2,$3}}' |tr "\n" "\t" |sed 's/\t#/\n/g' |awk '{print $2,$3,$5,$6}' |awk '{if(substr($1,1,5)=="Black") {print $1,$2,$3,$4} else {print $3,$4,$1,$2}}' |awk 'substr($1,1,5)!=substr($3,1,5){print $2}' |xargs -I xx awk '$9=="'xx'"' PrimaryBlackAbGeneAnnots.gff3|awk '{if($7=="+") {print $4} else {print $5}}' >Col2.list +less segments.txt |awk 'NR>1{print $2,$5,$6}' |awk '{if(NR%2) {print "#"$1,$2,$3}else {print $1,$2,$3}}' |tr "\n" "\t" |sed 's/\t#/\n/g' |awk '{print $2,$3,$5,$6}' |awk '{if(substr($1,1,5)=="Black") {print $1,$2,$3,$4} else {print $3,$4,$1,$2}}' |awk 'substr($1,1,5)!=substr($3,1,5){print $3}' |xargs -I xx awk '$9=="'xx'"' PrimaryGreenAbGeneAnnots.gff3|awk '{if($7=="+") {print $1,$4} else {print $1,$5}}' >Col3.list +less segments.txt |awk 'NR>1{print $2,$5,$6}' |awk '{if(NR%2) {print "#"$1,$2,$3}else {print $1,$2,$3}}' |tr "\n" "\t" |sed 's/\t#/\n/g' |awk '{print $2,$3,$5,$6}' |awk '{if(substr($1,1,5)=="Black") {print $1,$2,$3,$4} else {print $3,$4,$1,$2}}' |awk 'substr($1,1,5)!=substr($3,1,5){print $4}' |xargs -I xx awk '$9=="'xx'"' PrimaryGreenAbGeneAnnots.gff3|awk '{if($7=="+") {print $4} else {print $5}}' >Col4.list + +#This concatenates each of the locations and places them so the start is always before the end. 
+paste Col1.list Col2.list Col3.list Col4.list |awk '{if ($2>$3) {print $1,$3,$2,$4,$5,$6} else {print $0}}' |awk '{if ($5>$6) {print $1,$2,$3,$4,$6,$5} else {print $0}}' |tr "\t" " " >SyntenicRibbons.conf + + +#Here is the SyntenicRibbons.conf file #scaffold position position,scaffold, position, position +#These two commands are essentially extracting the scaffold lengths in your genome and putting them in the proper format. +bioawk -c fastx '{print $name,length($seq)}' RenamedBlackAbGenome.fasta |awk '{print "chr","-",$1,$1,"0",$2,"blue"}' >RenamedBlackAbKaryotype.conf + +bioawk -c fastx '{print $name,length($seq)}' RenamedGreenAbGenome.fasta |awk '{print "chr","-",$1,$1,"0",$2,"green"}' >RenamedGreenAbKaryotype.conf + + + +#The next six scripts below are essentially extracting the scaffolds that have some synteny. You dont want to display those scaffolds that do not have any information, right?. Make sure you have the proper column for each extraction. Remember column 1 is one species' scaffolds, and column 4 is the other species' scaffolds +awk '{print $1}' SyntenicRibbons.conf|while read line; do echo "awk '\$3==\""$line"\"' RenamedBlackAbKaryotype.conf >>tmpKaryotype.conf1";done >RenamedBlackAbKaryotype.sh +sh RenamedBlackAbKaryotype.sh + +awk '{print $4}' SyntenicRibbons.conf|while read line; do echo "awk '\$3==\""$line"\"' RenamedGreenAbKaryotype.conf >>tmpKaryotype.conf2";done >RenamedGreenAbKaryotype.sh +sh RenamedGreenAbKaryotype.sh +cat <(sort tmpKaryotype.conf1 |uniq) <(sort tmpKaryotype.conf2 |uniq) >karyotype.conf + + +#Now lets reduce the number of times the circos synteny plot lines overlap, so it is more pleasing to the eye. +#I just download this tool everytime because it is small and easier than finding the original circos installation directory + +wget http://circos.ca/distribution/circos-tools-0.22.tgz + tar -zxvf circos-tools-0.22.tgz + + #We will use the tmpKaryotype.conf1 file to get the scaffold names that we want grouped together. 
You can also use tmpKaryotype.conf2 to do this. I would suggest using the file that is the smallest. + #the below script generates the command. + sort tmpKaryotype.conf1 |uniq|awk '{print $3}' |tr "\n" "," |sed 's/.$//' |awk '{print "circos-tools-0.22/tools/orderchr/bin/orderchr -links SyntenicRibbons.conf -karyotype karyotype.conf - "$0" -static_rx "$0 }' |less + + + #it runs something like this +circos-tools-0.22/tools/orderchr/bin/orderchr -links SyntenicRibbons.conf -karyotype karyotype.conf - Black_1022,Black_1074,Black_1076,Black_10,Black_119,Black_11,Black_127,Black_12,Black_1327,Black_137,Black_13,Black_144,Black_1468,Black_1469,Black_14,Black_154,Black_158,Black_15,Black_16,Black_175,Black_17,Black_18,Black_195,Black_19,Black_1,Black_213,Black_22,Black_239,Black_25,Black_2,Black_300,Black_302,Black_328,Black_335,Black_350,Black_36,Black_372,Black_373,Black_381,Black_387,Black_393,Black_3,Black_427,Black_476,Black_488,Black_496,Black_4,Black_544,Black_546,Black_564,Black_573,Black_5,Black_60,Black_628,Black_654,Black_662,Black_6,Black_713,Black_72,Black_760,Black_7,Black_834,Black_879,Black_8,Black_967,Black_9 -static_rx Black_1022,Black_1074,Black_1076,Black_10,Black_119,Black_11,Black_127,Black_12,Black_1327,Black_137,Black_13,Black_144,Black_1468,Black_1469,Black_14,Black_154,Black_158,Black_15,Black_16,Black_175,Black_17,Black_18,Black_195,Black_19,Black_1,Black_213,Black_22,Black_239,Black_25,Black_2,Black_300,Black_302,Black_328,Black_335,Black_350,Black_36,Black_372,Black_373,Black_381,Black_387,Black_393,Black_3,Black_427,Black_476,Black_488,Black_496,Black_4,Black_544,Black_546,Black_564,Black_573,Black_5,Black_60,Black_628,Black_654,Black_662,Black_6,Black_713,Black_72,Black_760,Black_7,Black_834,Black_879,Black_8,Black_967,Black_9 +calculating round 0 +report round 0 minimize init 141285 final 26320 change 81.37% +calculating round 1 +report round 1 minimize init 26320 final 9978 change 62.09% +calculating round 2 +report round 2 minimize init 
9978 final 8770 change 12.11% +calculating round 3 +report round 3 minimize init 8770 final 8770 change 0.00% +scorereport init 141285 final 8770 change 93.79% +chromosomes_order = Green_6,Black_7,Black_13,Black_119,Black_158,Black_127,Black_6,Green_7,Green_8,Green_10,Black_14,Black_628,Black_19,Black_654,Black_239,Black_573,Black_662,Black_25,Black_302,Green_9,Black_9,Black_8,Black_1076,Black_1,Black_17,Black_1074,Black_36,Green_17,Black_3,Black_300,Black_335,Black_328,Black_387,Black_381,Black_496,Black_1469,Black_350,Black_5,Green_5,Black_879,Black_18,Green_18,Green_13,Black_11,Black_4,Black_213,Black_427,Black_12,Black_373,Black_175,Black_488,Black_476,Green_14,Black_1022,Black_60,Black_760,Black_72,Black_834,Black_22,Black_195,Black_154,Green_4,Green_3,Green_454,Black_1327,Green_719,Black_546,Black_544,Green_1,Green_2,Green_11,Black_137,Black_15,Green_15,Black_10,Black_393,Black_967,Black_372,Black_16,Green_16,Black_144,Black_2,Black_713,Black_1468,Black_564,Green_12 + + + + #the last bit is what we want chromosomes_order = ..... +``` +### Paste in the remaining Circos files and run Circos! +``` + +#Essentially you can copy and paste the four files listed below: circos.conf, ticks.conf, bands.conf, and ideogram.conf. However, not every genome is the same size as an E.coli genome, so a few things can be changed. +1. In ideogram.conf you can change "radius = 0.84r". This will alter how far out your scaffold names will display +2. In circos.conf you can change "chromosomes_units = 100000" to a larger or smaller number to shrink or enlarge how the chromosomes display +3. In ticks.conf you can change "multiplier = 1e-5" to decide how often to label your ticks in your circos chart. 
+ +#circos.conf +############################################################################# +karyotype = ./karyotype.conf +chromosomes_units = 100000 + <> + <> + <> + + + + file=SyntenicRibbons.conf + radius = 0.94r + bezier_radius = 0.1r + thickness = 1 + ribbon = yes + + + + + + + <> +angle_offset* = -46 + +<> + <> +chromosomes_order = Green_6,Black_7,Black_13,Black_119,Black_158,Black_127,Black_6,Green_7,Green_8,Green_10,Black_14,Black_628,Black_19,Black_654,Black_239,Black_573,Black_662,Black_25,Black_302,Green_9,Black_9,Black_8,Black_1076,Black_1,Black_17,Black_1074,Black_36,Green_17,Black_3,Black_300,Black_335,Black_328,Black_387,Black_381,Black_496,Black_1469,Black_350,Black_5,Green_5,Black_879,Black_18,Green_18,Green_13,Black_11,Black_4,Black_213,Black_427,Black_12,Black_373,Black_175,Black_488,Black_476,Green_14,Black_1022,Black_60,Black_760,Black_72,Black_834,Black_22,Black_195,Black_154,Green_4,Green_3,Green_454,Black_1327,Green_719,Black_546,Black_544,Green_1,Green_2,Green_11,Black_137,Black_15,Green_15,Black_10,Black_393,Black_967,Black_372,Black_16,Green_16,Black_144,Black_2,Black_713,Black_1468,Black_564,Green_12 + + +############################################################################# + +ticks.conf +############################################################################### +show_ticks = yes +show_tick_labels = no + + radius = 1r + color = black + thickness = 10p + multiplier = 1e-7 + format = %d + + spacing = 100u + size = 25p + show_label = yes + label_size = 25p + label_offset = 10p + format = %d + + + + +############################################################################### + +bands.conf +############################################################################### + + show_bands = yes + fill_bands = yes + band_transparency = 4 + +############################################################################### + +ideogram.conf +############################################################################### + + + default 
= 0.006r + break = 30u + axis_break_at_edge = yes + axis_break = yes + axis_break_style = 2 + + stroke_color = black + thickness = 0.45r + stroke_thickness = 2p + + + stroke_color = black + stroke_thickness = 5p + thickness = 4r + + + radius = 0.74r + thickness = 80p + fill = yes + stroke_color = white + stroke_thickness = 4p + fill_color = black + show_label = yes + label_font = bold + label_size = 16 + label_parallel = no + + label_radius = dims(ideogram,radius_outer) + 0.06r + +############################################################################### + +#This last file I always copy to the working directory, just in case I have more than 200 chromosomes I want to display. +If so, change this line +cp /opt/rit/el9/20230413/app/linux-rhel9-x86_64_v3/gcc-11.2.1/circos-0.69-6-learnz7tfqrflpcu57fbdtzxc47cii2a/lib/circos/etc/housekeeping.conf . +"max_ideograms = 200" + + +#All that is left is to run circos! + +circos -conf circos.conf +``` + + + +![Circos](../../assets/BlackVGreen.png) + + -#All I am doing here is creating a column in the gff that has only the protein name. This way I can use grep -w to get exact matches later on. -sed 's/;/\t/g' GCF_000005845.2_ASM584v2_genomic.gff |sed 's/Protein Homology/Protein_Homology/g' |sed 's/Name=//g' |awk '$3=="CDS"' >5845GrepMod.gff -sed 's/;/\t/g' ModGCF_000249815.1_ASM24981v2_genomic.gff |sed 's/Protein Homology/Protein_Homology/g' |sed 's/Name=//g' |awk '$3=="CDS"' >24981GrepMod.gff -# The five scripts below will just work if you change 3 things. 1. change "pathenogenic" to whatever you named your second genome in your iadhore.ini file. 2. change 24891GrepMod.gff to the grepMod gff that is associated with the second genome name in your iadhore.ini file. 3. change the 5845GrepMod.gff to the grepMod gff you created that is associated with the first genome name in your iadhore.ini file. + + + + + + + + + + + + +### Time to put it into circos! 
Black vs White +Circos is just a good visualization tool for synteny, which I find to be more informative than dot plots. Circos can go much further than dot plots by allowing other features to be plotted alongside the synteny, for example: gene density, repeat density, snp density, etc) +``` + +/work/gif3/masonbrink/USDA/01_OrthofinderSynteny/05_Circos/02_Black_White + +#Softlink all relevant files:(genome, GFF, segments.txt) +ln -s ../../03_iadhore/01_Black_Green/output/segments.txt +ln -s ../../01_GenomicResources/PrimaryWhiteAbGeneAnnots.gff3 +ln -s ../../01_GenomicResources/PrimaryBlackAbGeneAnnots.gff3 +ln -s ../../01_GenomicResources/RenamedBlackAbGenome.fasta +ln -s ../../01_GenomicResources/RenamedAbGenome.fasta + + +# The five scripts below will just work if you change 3 things. 1. change "GreenAbalone" to whatever you named your second genome in your iadhore.ini file. 2. change PrimaryGreenAbGeneAnnots.gff3 to the gff that is associated with the second genome name in your iadhore.ini file. 3. change the PrimaryBlackAbGeneAnnots.gff3 to the gff you created that is associated with the first genome name in your iadhore.ini file. Essentially what is happening below is that you swapping columns in segments.txt until you get pathogenic all on one side. 
Then I extract the 5' position for the 5' syntenic gene and the 3' position for the 3' syntenic gene for each genome -less segments.txt |awk 'NR>1' |sed 'N;s/\n/ /' |awk '$3!=$10 {print $1,$2,$3,$4,$5,$6,$7"\n"$8,$9,$10,$11,$12,$13,$14}' |awk '{if(NR%2) {print "#"$3,$4,$5,$6}else {print $3,$4,$5,$6}}' |tr "\n" " " |tr "#" "\n" |awk '{if($5=="pathenogenic") {print $5,$6,$7,$8,$1,$2,$3,$4} else {print $1,$2,$3,$4,$5,$6,$7,$8}}' |awk '{print $3}' |sed '/^$/d' |while read line; do grep -w $line 24981GrepMod.gff; done |awk '{if($7=="+") {print $5} else {print $4}}' >Col3 -less segments.txt |awk 'NR>1' |sed 'N;s/\n/ /' |awk '$3!=$10 {print $1,$2,$3,$4,$5,$6,$7"\n"$8,$9,$10,$11,$12,$13,$14}' |awk '{if(NR%2) {print "#"$3,$4,$5,$6}else {print $3,$4,$5,$6}}' |tr "\n" " " |tr "#" "\n" |awk '{if($5=="pathenogenic") {print $5,$6,$7,$8,$1,$2,$3,$4} else {print $1,$2,$3,$4,$5,$6,$7,$8}}' |awk '{print $4}' |sed '/^$/d' |while read line; do grep -w $line 24981GrepMod.gff; done |awk '{if($7=="+") {print $4} else {print $5}}' >Col4 -less segments.txt |awk 'NR>1' |sed 'N;s/\n/ /' |awk '$3!=$10 {print $1,$2,$3,$4,$5,$6,$7"\n"$8,$9,$10,$11,$12,$13,$14}' |awk '{if(NR%2) {print "#"$3,$4,$5,$6}else {print $3,$4,$5,$6}}' |tr "\n" " " |tr "#" "\n" |awk '{if($5=="pathenogenic") {print $5,$6,$7,$8,$1,$2,$3,$4} else {print $1,$2,$3,$4,$5,$6,$7,$8}}' |awk '{print $7}' |sed '/^$/d' |while read line; do grep -w $line 5845GrepMod.gff; done |awk '{if($7=="+") {print $5} else {print $4}}' >Col7 -less segments.txt |awk 'NR>1' |sed 'N;s/\n/ /' |awk '$3!=$10 {print $1,$2,$3,$4,$5,$6,$7"\n"$8,$9,$10,$11,$12,$13,$14}' |awk '{if(NR%2) {print "#"$3,$4,$5,$6}else {print $3,$4,$5,$6}}' |tr "\n" " " |tr "#" "\n" |awk '{if($5=="pathenogenic") {print $5,$6,$7,$8,$1,$2,$3,$4} else {print $1,$2,$3,$4,$5,$6,$7,$8}}' |awk '{print $8}' |sed '/^$/d' |while read line; do grep -w $line 5845GrepMod.gff; done |awk '{if($7=="+") {print $4} else {print $5}}' >Col8 -#This last step adds the scaffold names to the gene 
positions extracted above. -less segments.txt |awk 'NR>1' |sed 'N;s/\n/ /' |awk '$3!=$10 {print $1,$2,$3,$4,$5,$6,$7"\n"$8,$9,$10,$11,$12,$13,$14}' |awk '{if(NR%2) {print "#"$3,$4,$5,$6}else {print $3,$4,$5,$6}}' |tr "\n" "\t" |tr "#" "\n" |awk '{if($5=="pathenogenic") {print $5,$6,$7,$8,$1,$2,$3,$4} else {print $1,$2,$3,$4,$5,$6,$7,$8}}' |awk '{print $2,$6}' |awk 'NR>1' |paste - Col3 Col4 Col7 Col8 |awk '{print $1,$3,$4,$2,$5,$6}' >SyntenicRibbons.conf +less segments.txt |awk 'NR>1{print $2,$5,$6}' |awk '{if(NR%2) {print "#"$1,$2,$3}else {print $1,$2,$3}}' |tr "\n" "\t" |sed 's/\t#/\n/g' |awk '{print $2,$3,$5,$6}' |awk '{if(substr($1,1,5)=="Black") {print $1,$2,$3,$4} else{print $3,$4,$1,$2}}' |awk 'substr($1,1,5)!=substr($3,1,5) {print $1}' |xargs -I xx awk '$9=="'xx'"' PrimaryBlackAbGeneAnnots.gff3|awk '{if($7=="+") {print $1,$4} else {print $1,$5}}' >Col1.list +less segments.txt |awk 'NR>1{print $2,$5,$6}' |awk '{if(NR%2) {print "#"$1,$2,$3}else {print $1,$2,$3}}' |tr "\n" "\t" |sed 's/\t#/\n/g' |awk '{print $2,$3,$5,$6}' |awk '{if(substr($1,1,5)=="Black") {print $1,$2,$3,$4} else {print $3,$4,$1,$2}}' |awk 'substr($1,1,5)!=substr($3,1,5){print $2}' |xargs -I xx awk '$9=="'xx'"' PrimaryBlackAbGeneAnnots.gff3|awk '{if($7=="+") {print $4} else {print $5}}' >Col2.list +less segments.txt |awk 'NR>1{print $2,$5,$6}' |awk '{if(NR%2) {print "#"$1,$2,$3}else {print $1,$2,$3}}' |tr "\n" "\t" |sed 's/\t#/\n/g' |awk '{print $2,$3,$5,$6}' |awk '{if(substr($1,1,5)=="Black") {print $1,$2,$3,$4} else {print $3,$4,$1,$2}}' |awk 'substr($1,1,5)!=substr($3,1,5){print $3}' |xargs -I xx awk '$9=="'xx'"' PrimaryGreenAbGeneAnnots.gff3|awk '{if($7=="+") {print $1,$4} else {print $1,$5}}' >Col3.list +less segments.txt |awk 'NR>1{print $2,$5,$6}' |awk '{if(NR%2) {print "#"$1,$2,$3}else {print $1,$2,$3}}' |tr "\n" "\t" |sed 's/\t#/\n/g' |awk '{print $2,$3,$5,$6}' |awk '{if(substr($1,1,5)=="Black") {print $1,$2,$3,$4} else {print $3,$4,$1,$2}}' |awk 'substr($1,1,5)!=substr($3,1,5){print 
$4}' |xargs -I xx awk '$9=="'xx'"' PrimaryGreenAbGeneAnnots.gff3|awk '{if($7=="+") {print $4} else {print $5}}' >Col4.list + +#This concatenates each of the locations and places them so the start is always before the end. +paste Col1.list Col2.list Col3.list Col4.list |awk '{if ($2>$3) {print $1,$3,$2,$4,$5,$6} else {print $0}}' |awk '{if ($5>$6) {print $1,$2,$3,$4,$6,$5} else {print $0}}' |tr "\t" " " >SyntenicRibbons.conf + #Here is the SyntenicRibbons.conf file #scaffold position position,scaffold, position, position #These two commands are essentially extracting the scaffold lengths in your genome and putting them in the proper format. -bioawk -c fastx '{print $name,length($seq)}' GCF_000249815.1_ASM24981v2_genomic.fna |awk '{print "chr","-",$1,$1,"0",$2,"blue"}' >PathenogenicKaryotype.conf +bioawk -c fastx '{print $name,length($seq)}' RenamedBlackAbGenome.fasta |awk '{print "chr","-",$1,$1,"0",$2,"blue"}' >RenamedBlackAbKaryotype.conf -bioawk -c fastx '{print $name,length($seq)}' GCF_000005845.2_ASM584v2_genomic.fna |sed 's/\.3//g' |awk '{print "chr","-",$1,$1,"0",$2,"green"}' >NonpathKaryotype.conf +bioawk -c fastx '{print $name,length($seq)}' RenamedGreenAbGenome.fasta |awk '{print "chr","-",$1,$1,"0",$2,"green"}' >RenamedGreenAbKaryotype.conf #The next six scripts below are essentially extracting the scaffolds that have some synteny. You dont want to display those scaffolds that do not have any information, right?. Make sure you have the proper column for each extraction. 
Remember column 1 is one species' scaffolds, and column 4 is the other species' scaffolds -awk '{print $4}' SyntenicRibbons.conf|while read line; do echo "awk '\$3==\""$line"\"' NonpathKaryotype.conf >>tmpKaryotype.conf1";done >NonpathKaryotype.sh -sh NonpathKaryotype.sh +awk '{print $1}' SyntenicRibbons.conf|while read line; do echo "awk '\$3==\""$line"\"' RenamedBlackAbKaryotype.conf >>tmpKaryotype.conf1";done >RenamedBlackAbKaryotype.sh +sh RenamedBlackAbKaryotype.sh -awk '{print $1}' SyntenicRibbons.conf|while read line; do echo "awk '\$3==\""$line"\"' PathenogenicKaryotype.conf >>tmpKaryotype.conf2";done >PathenogenicKaryotype.sh -sh PathenogenicKaryotype.sh +awk '{print $4}' SyntenicRibbons.conf|while read line; do echo "awk '\$3==\""$line"\"' RenamedGreenAbKaryotype.conf >>tmpKaryotype.conf2";done >RenamedGreenAbKaryotype.sh +sh RenamedGreenAbKaryotype.sh cat <(sort tmpKaryotype.conf1 |uniq) <(sort tmpKaryotype.conf2 |uniq) >karyotype.conf @@ -421,17 +630,18 @@ wget http://circos.ca/distribution/circos-tools-0.22.tgz #it runs something like this - circos-tools-0.22/tools/orderchr/bin/orderchr -links SyntenicRibbons.conf -karyotype karyotype.conf - NC_000913 -static_rx NC_000913 - calculating round 0 - report round 0 minimize init 9066 final 5848 change 35.50% - calculating round 1 - report round 1 minimize init 5848 final 874 change 85.05% - calculating round 2 - report round 2 minimize init 874 final 459 change 47.48% - calculating round 3 - report round 3 minimize init 459 final 365 change 20.48% - scorereport init 9066 final 365 change 95.97% - chromosomes_order = 
NC_000913,NZ_AIGG01000090.1,NZ_AIGG01000089.1,NZ_AIGG01000087.1,NZ_AIGG01000086.1,NZ_AIGG01000083.1,NZ_AIGG01000069.1,NZ_AIGG01000081.1,NZ_AIGG01000082.1,NZ_AIGG01000096.1,NZ_AIGG01000074.1,NZ_AIGG01000080.1,NZ_AIGG01000079.1,NZ_AIGG01000078.1,NZ_AIGG01000075.1,NZ_AIGG01000072.1,NZ_AIGG01000073.1,NZ_AIGG01000070.1,NZ_AIGG01000035.1,NZ_AIGG01000068.1,NZ_AIGG01000067.1,NZ_AIGG01000066.1,NZ_AIGG01000060.1,NZ_AIGG01000057.1,NZ_AIGG01000029.1,NZ_AIGG01000054.1,NZ_AIGG01000053.1,NZ_AIGG01000052.1,NZ_AIGG01000091.1,NZ_AIGG01000051.1,NZ_AIGG01000050.1,NZ_AIGG01000045.1,NZ_AIGG01000039.1,NZ_AIGG01000038.1,NZ_AIGG01000034.1,NZ_AIGG01000028.1,NZ_AIGG01000032.1,NZ_AIGG01000027.1,NZ_AIGG01000025.1,NZ_AIGG01000018.1,NZ_AIGG01000013.1,NZ_AIGG01000011.1,NZ_AIGG01000008.1,NZ_AIGG01000006.1,NZ_AIGG01000005.1,NZ_AIGG01000003.1,NZ_AIGG01000095.1,NZ_AIGG01000004.1,NZ_AIGG01000001.1 +circos-tools-0.22/tools/orderchr/bin/orderchr -links SyntenicRibbons.conf -karyotype karyotype.conf - Black_1022,Black_1074,Black_1076,Black_10,Black_119,Black_11,Black_127,Black_12,Black_1327,Black_137,Black_13,Black_144,Black_1468,Black_1469,Black_14,Black_154,Black_158,Black_15,Black_16,Black_175,Black_17,Black_18,Black_195,Black_19,Black_1,Black_213,Black_22,Black_239,Black_25,Black_2,Black_300,Black_302,Black_328,Black_335,Black_350,Black_36,Black_372,Black_373,Black_381,Black_387,Black_393,Black_3,Black_427,Black_476,Black_488,Black_496,Black_4,Black_544,Black_546,Black_564,Black_573,Black_5,Black_60,Black_628,Black_654,Black_662,Black_6,Black_713,Black_72,Black_760,Black_7,Black_834,Black_879,Black_8,Black_967,Black_9 -static_rx 
Black_1022,Black_1074,Black_1076,Black_10,Black_119,Black_11,Black_127,Black_12,Black_1327,Black_137,Black_13,Black_144,Black_1468,Black_1469,Black_14,Black_154,Black_158,Black_15,Black_16,Black_175,Black_17,Black_18,Black_195,Black_19,Black_1,Black_213,Black_22,Black_239,Black_25,Black_2,Black_300,Black_302,Black_328,Black_335,Black_350,Black_36,Black_372,Black_373,Black_381,Black_387,Black_393,Black_3,Black_427,Black_476,Black_488,Black_496,Black_4,Black_544,Black_546,Black_564,Black_573,Black_5,Black_60,Black_628,Black_654,Black_662,Black_6,Black_713,Black_72,Black_760,Black_7,Black_834,Black_879,Black_8,Black_967,Black_9 +calculating round 0 +report round 0 minimize init 141285 final 26320 change 81.37% +calculating round 1 +report round 1 minimize init 26320 final 9978 change 62.09% +calculating round 2 +report round 2 minimize init 9978 final 8770 change 12.11% +calculating round 3 +report round 3 minimize init 8770 final 8770 change 0.00% +scorereport init 141285 final 8770 change 93.79% +chromosomes_order = Green_6,Black_7,Black_13,Black_119,Black_158,Black_127,Black_6,Green_7,Green_8,Green_10,Black_14,Black_628,Black_19,Black_654,Black_239,Black_573,Black_662,Black_25,Black_302,Green_9,Black_9,Black_8,Black_1076,Black_1,Black_17,Black_1074,Black_36,Green_17,Black_3,Black_300,Black_335,Black_328,Black_387,Black_381,Black_496,Black_1469,Black_350,Black_5,Green_5,Black_879,Black_18,Green_18,Green_13,Black_11,Black_4,Black_213,Black_427,Black_12,Black_373,Black_175,Black_488,Black_476,Green_14,Black_1022,Black_60,Black_760,Black_72,Black_834,Black_22,Black_195,Black_154,Green_4,Green_3,Green_454,Black_1327,Green_719,Black_546,Black_544,Green_1,Green_2,Green_11,Black_137,Black_15,Green_15,Black_10,Black_393,Black_967,Black_372,Black_16,Green_16,Black_144,Black_2,Black_713,Black_1468,Black_564,Green_12 + #the last bit is what we want chromosomes_order = ..... 
@@ -465,27 +675,28 @@ chromosomes_units = 100000 - <> + <> angle_offset* = -46 -<> +<> <> - chromosomes_order = NC_000913,NZ_AIGG01000090.1,NZ_AIGG01000089.1,NZ_AIGG01000087.1,NZ_AIGG01000086.1,NZ_AIGG01000083.1,NZ_AIGG01000069.1,NZ_AIGG01000081.1,NZ_AIGG01000082.1,NZ_AIGG01000096.1,NZ_AIGG01000074.1,NZ_AIGG01000080.1,NZ_AIGG01000079.1,NZ_AIGG01000078.1,NZ_AIGG01000075.1,NZ_AIGG01000072.1,NZ_AIGG01000073.1,NZ_AIGG01000070.1,NZ_AIGG01000035.1,NZ_AIGG01000068.1,NZ_AIGG01000067.1,NZ_AIGG01000066.1,NZ_AIGG01000060.1,NZ_AIGG01000057.1,NZ_AIGG01000029.1,NZ_AIGG01000054.1,NZ_AIGG01000053.1,NZ_AIGG01000052.1,NZ_AIGG01000091.1,NZ_AIGG01000051.1,NZ_AIGG01000050.1,NZ_AIGG01000045.1,NZ_AIGG01000039.1,NZ_AIGG01000038.1,NZ_AIGG01000034.1,NZ_AIGG01000028.1,NZ_AIGG01000032.1,NZ_AIGG01000027.1,NZ_AIGG01000025.1,NZ_AIGG01000018.1,NZ_AIGG01000013.1,NZ_AIGG01000011.1,NZ_AIGG01000008.1,NZ_AIGG01000006.1,NZ_AIGG01000005.1,NZ_AIGG01000003.1,NZ_AIGG01000095.1,NZ_AIGG01000004.1,NZ_AIGG01000001.1 +chromosomes_order = Green_6,Black_7,Black_13,Black_119,Black_158,Black_127,Black_6,Green_7,Green_8,Green_10,Black_14,Black_628,Black_19,Black_654,Black_239,Black_573,Black_662,Black_25,Black_302,Green_9,Black_9,Black_8,Black_1076,Black_1,Black_17,Black_1074,Black_36,Green_17,Black_3,Black_300,Black_335,Black_328,Black_387,Black_381,Black_496,Black_1469,Black_350,Black_5,Green_5,Black_879,Black_18,Green_18,Green_13,Black_11,Black_4,Black_213,Black_427,Black_12,Black_373,Black_175,Black_488,Black_476,Green_14,Black_1022,Black_60,Black_760,Black_72,Black_834,Black_22,Black_195,Black_154,Green_4,Green_3,Green_454,Black_1327,Green_719,Black_546,Black_544,Green_1,Green_2,Green_11,Black_137,Black_15,Green_15,Black_10,Black_393,Black_967,Black_372,Black_16,Green_16,Black_144,Black_2,Black_713,Black_1468,Black_564,Green_12 + ############################################################################# ticks.conf ############################################################################### show_ticks = 
yes -show_tick_labels = yes +show_tick_labels = no radius = 1r color = black thickness = 10p - multiplier = 1e-4 + multiplier = 1e-7 format = %d - spacing = 10u + spacing = 100u size = 25p show_label = yes label_size = 25p @@ -545,7 +756,7 @@ ideogram.conf If so, change this line "max_ideograms = 200" -cp /work/GIF/software/programs/circos/0.69-4/etc/housekeeping.conf . + cp /opt/rit/el9/20230413/app/linux-rhel9-x86_64_v3/gcc-11.2.1/circos-0.69-6-learnz7tfqrflpcu57fbdtzxc47cii2a/lib/circos/etc/housekeeping.conf . #All that is left is to run circos! @@ -553,7 +764,6 @@ cp /work/GIF/software/programs/circos/0.69-4/etc/housekeeping.conf . circos -conf circos.conf ``` -![Circos](../../assets/E.coliSynteny.png) --- [Table of contents](compGenomics_index.md) diff --git a/dataAnalysis/ComparativeGenomics/assets/BlackVGreencircos.png b/dataAnalysis/ComparativeGenomics/assets/BlackVGreencircos.png new file mode 100644 index 0000000..04eb7bb Binary files /dev/null and b/dataAnalysis/ComparativeGenomics/assets/BlackVGreencircos.png differ