From 05ac0bb7da5c38edaed76c8b3930233000b61536 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 12 Jun 2024 11:40:37 +0200 Subject: [PATCH 1/3] manage chevron in GFF start and stop --- ppanggolin/annotate/annotate.py | 36 +++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 659954c9..56fdfe38 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -601,6 +601,29 @@ def get_id_attribute(attributes_dict: dict) -> str: raise Exception(f"Each CDS type of the gff files must own a unique ID attribute. " f"Not the case for file: {gff_file_path}") return element_id + + + def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, bool]: + """ + Checks for the presence of chevrons ('<' or '>') in the start and stop strings, removes them if present, + and converts the remaining parts to integers. + + :param start: The start string which may contain chevrons. + :param stop: The stop string which may contain chevrons. + + :return: A tuple containing the integer values of start and stop, and a boolean indicating if chevrons were present in either string. 
+ """ + chevrons_present = '>' in start or '<' in start or '>' in stop or '<' in stop + + if chevrons_present: + start = int(start.replace('<', '').replace('>', '')) + stop = int(stop.replace('<', '').replace('>', '')) + else: + start = int(start) + stop = int(stop) + + return start, stop, chevrons_present + contig = None # initialize contig has_fasta = False @@ -646,8 +669,13 @@ def get_id_attribute(attributes_dict: dict) -> str: else: fields_gff = [el.strip() for el in line.split('\t')] attributes = get_gff_attributes(fields_gff) + pseudogene = False + start, stop, has_chevron = check_chevrons_in_start_and_stop(start=fields_gff[gff_start], stop=fields_gff[gff_end]) + if has_chevron: + pseudogene = True + if fields_gff[gff_type] == 'region': # keep region attributes to add them as metadata of genome and contigs # excluding some info as they are alredy contained in contig object. @@ -714,8 +742,8 @@ def get_id_attribute(attributes_dict: dict) -> str: "position":contig.number_of_genes, "product":product, "local_identifier":gene_id, - "start": int(fields_gff[gff_start]), - "stop": int(fields_gff[gff_end]), + "start": start, + "stop": stop, "ID": id_attribute} check_and_add_extra_gene_part(existing_gene, new_gene_info) @@ -728,7 +756,7 @@ def get_id_attribute(attributes_dict: dict) -> str: id_attr_to_gene_id[id_attribute] = gene # here contig is filled in order, so position is the number of genes already stored in the contig. 
- gene.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]), + gene.fill_annotations(start=start, stop=stop, strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name, position=contig.number_of_genes, product=product, local_identifier=gene_id, @@ -742,7 +770,7 @@ def get_id_attribute(attributes_dict: dict) -> str: rna_type = fields_gff[gff_type] rna = RNA(org.name + f"_{rna_type}_" + str(rna_counter).zfill(4)) - rna.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]), + rna.fill_annotations(start=start, stop=stop, strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name, product=product, local_identifier=gene_id) rna.fill_parents(org, contig) From 81030ce7bf4156d1a129ad754a49b002245f0987 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 12 Jun 2024 15:08:40 +0200 Subject: [PATCH 2/3] add fake plasmid with chevron to test chevron handling in GFF in CI --- .github/workflows/main.yml | 3 +++ ..._CP007132_with_manually_added_chevrons.gff.gz | Bin 0 -> 3199 bytes 2 files changed, 3 insertions(+) create mode 100644 testingDataset/GBFF/plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 88088f66..52cd8f8c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -191,6 +191,9 @@ jobs: --genome_name chlam_annotated_with_prodigal --anno GBFF/GCF_003788785.1_ct114V1_genomic_prodigal_annotation.gff.gz \ --gff --table --cpu $NUM_CPUS + # projection of a plasmid with chevron that have been added manually to test chevron handeling in GFF + ppanggolin projection --pangenome myannopang/pangenome.h5 --anno plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz --cpu $NUM_CPUS -o projection_plasmid_with_chevron + - name: testing write_genome_cmds shell: bash -l {0} run: | diff --git a/testingDataset/GBFF/plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz 
b/testingDataset/GBFF/plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz new file mode 100644 index 0000000000000000000000000000000000000000..4826efde791569fa06c4138ef68158426bad96c3 GIT binary patch literal 3199 zcmV-_41n_=iwFq(n`veM18{6%b8Ts4Urt(ILr^d4o9&O1q$NEF}#=gW`^Y9;UV|gv-j`b?fkIX-mNz`(`9+~>~8hn zkE@%@)y{VHK0bT(pL0K(%?{?v#q;@eK0VkyT9!{foSbbpZ`bcHu8wE3#cZ}e@t3bB zf0|F`v%|&Y`Q#6i+2q@or?0;K+h4vud-=`zY<9HVKUv?rTW#Z<(~CFL_09Tzy_w!_ zS8vys{ngF!XIq|JZ7x6Fo!?))|9qd@zVPGa{pp9FZa4QIR`=`6i>toT=6va$`sU(pHT`$L$P-LlEEc1qBMh-S|1weN`~1!*>>c(f?C(Dsg~ey-v-5n1 z*>kt~zSG2K=`)`niAMPK7mwibU~jQ_%o&T}jMulD9>R+_gvaK~59>GU`^|O)^!4WZ zCpfA5XFiF_@YN$;T7IE$E`L3o&z^l_ubY*xKDlyftyMSk=KV#D|E>0SPx`d{`r`h= zt(o-~Sk!9k@8ZAy+6&*rLw$MePkS}2>L12?Y|z(KEUPt^)OgXqDt)!#c^rnVVLgB%COY1OgFVw+l=j_RMrMY}21xN#g457NgbkHql? zQ~DpC?A^z>VR$bbZE`uxFz%`A#5FxEb}PCw?h!95jqH!d#S~#(fDx?Dh=#-Z7nSaX z2k9bgQycuw09szRKh|K5)ncvywn`uNeQ}Gp19u|)!X$&&;%e|piI0Ui*a2h_Q0@%5 z<03TMV?PJO znZyNNrElYrz)!^y&ZI`3BC!Lo9&xVNKdueu)}~a&E^#1MR&1QyL671dl`sJT!kx%> zz#<=Fl0*hKBwid$xFRijs1S7koDnd%7wI3@2Q`RQNm7VA@;C^PkyWNNpoIZy^mr_Q zLAXKnf!kULUb1W*=?|Jheu5tXF%qyRAK}^&hBycmZ%8t+CX$lLg)q>|)sh%`oOMBf zGOk|%4(1qM0+UV_z)Df7djX4&8WL-hO_HV!yE4nbH@U(KBZTc;YemegGL?v;Pdrh; zuM0ts;8c2s3?9|UlwqFCsrzRdmP!;VM}Z*Aa!JA>le=gNJV_r0wn;032vXWv39+Y> zHyi6nrIGM!uNWjn5sr~N@QO&N#0rU_aWO9qY_oEu$DO=k35eKB>5DpAMWXOtsWc|* zh!ZnTV#5bWQUI5{%bX4~3eQp9Ag&ozq!Lk&^mNvcL>CHOtZ3xKeo)Wg4mh38N*#NMS}^1z$vFq$KNK zm575iRUibB@&$*mP^gwsaya!GI9J$i$)31-CI}5@MN}%=Kwcz<0f4-cv{K|I{VQnE zDrfPf)Tk=4l#|U;A5cG$~tp)`4xO59Dp>W&t=;i=whJ_ozO) z8iI!_BbS6^Qgzgc(n~Bf)L7JTL4ZSJF-sl9r|>$nO2uX{C?5)&Jj)AmuBs3wg5(ADC!$W#Hp&W=KABgj z&xS!c(x@9!&GRI83WP$RRsqn}7@2fA-6fF3ceIx&3vUy)E|gZ!7(pjc15xW#*50iA zy<(`NYex6N24+{*)kX$qond8T;s*5^TPUO0qsnNM+~NR6yOHfuyP&&~97w^UI_>Ov z)ZGHxO{Bn3vsRhw9G2xq+GOn;V8>qOqHuf?%Aht$N(1{-d5vxoJ9sv@**jC+)iO+O z$|A3Ho`gwLKTc0lySU6QB4~03RA!b0_MWBzz)<=?Ck1eUU{T|oR4LKRTyZFNCX>RR z5h#?n962;K(vaU&4mP5Zy;CY1a2R!2;@A-@AIt#-OKNt32P}jj6k2JT3yAO;;5u3x 
ziAna_R4b4bmTJ*emZUga%{CYS&K?+ERj(9TMc5F`EHBySlPbYT6eD2YHi)S3T2<|S z8^a56h7CG5xs7!QpA~h>qtQ&qx~4Lt5YYX6K~c?tgMjE+&H1QN^3m}v1(%JWMVyDw z7mM-YDj=G2pdXjtIk=Jbq})}c(U+J+5Aem)Dl~Rdv$~lWIEb)_-71lh)q-w@Ivll9 zr87oal+e_A*$D9g(Tz%Ul|rYvKw{D4p^X|cC|q@$l!s$k4b>Y(gThkEYoxj*S)@Hw z3g~_%4zgh=maPDX71;_n$%3d?BVkgONM|%7?d3rk=S$dzu6g&AaMLl~;Vf6;MD#(X)wY#Mj@=5y znkBuFf^)uVDK%Iu)oqRWZXq6Abc@7gQl+yV`swVsn+9ePXZ8E3MNAdhl(k7fGGffs z3~hp{&_hm|Yhugks%u=W#Npk+`i0eA8c&rlXQX2c0_T3h_^?VKaTO9OARZCbW&}GW z9(zv9<{PfB9QT?=qcTPOA2CPo(ME4Xx+>`eG}x9stGY{ECkGi+Z>eETgJL4V2>m2_ z*621`0Dwf^r^bRN8C|mJ(K;x}r2R;~*^~EW(ZxzT8fhS9A5$!UgFrFKu8FdPLl(0nFeN&J%~qbe%@rG! zed+mVft_vP224zHp?`GMe zsxdOskUrxCu1Y-8%;uT`$Fk zE0w8r)Z|KoJajh%vui*@UK!+}U#HlXa1x+U;5}vrRU=ro@07mK=!%mBZPTb<$AE?^ zmA3{FRegzm1Q@JSQ$HKoF}^Cy2CmTWEgI!?+_XW$j~#BLE7XVhYMP l&z~NkNF$%KJz#(R^S4dEJgDoX)WHvu{{!(L&^o9j008AWCKCVv literal 0 HcmV?d00001 From 1153c1d267910c59813e030a3739f6485dcb36d0 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 12 Jun 2024 15:23:59 +0200 Subject: [PATCH 3/3] fix plasmid path in CI --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 52cd8f8c..68d820f9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -192,7 +192,7 @@ jobs: --gff --table --cpu $NUM_CPUS # projection of a plasmid with chevron that have been added manually to test chevron handeling in GFF - ppanggolin projection --pangenome myannopang/pangenome.h5 --anno plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz --cpu $NUM_CPUS -o projection_plasmid_with_chevron + ppanggolin projection --pangenome myannopang/pangenome.h5 --anno GBFF/plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz --cpu $NUM_CPUS -o projection_plasmid_with_chevron - name: testing write_genome_cmds shell: bash -l {0}