From 0404aef983ecfcdcc887bcee08d985c59e56fb00 Mon Sep 17 00:00:00 2001 From: Nolan Woods Date: Thu, 23 Jun 2022 14:23:54 -0700 Subject: [PATCH] Handle slicing SeqRecords --- .idea/BioPython-Convert.iml | 2 +- .idea/misc.xml | 2 +- biopython_convert/JMESPathGen.py | 8 +++ biopython_convert/__init__.py | 4 +- test-data/outputs/jpath_slice | 88 ++++++++++++++++++++++++++++++++ tests/test_convert.py | 10 +++- 6 files changed, 110 insertions(+), 4 deletions(-) create mode 100644 test-data/outputs/jpath_slice diff --git a/.idea/BioPython-Convert.iml b/.idea/BioPython-Convert.iml index 83413fb..ad8c5d0 100644 --- a/.idea/BioPython-Convert.iml +++ b/.idea/BioPython-Convert.iml @@ -4,7 +4,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 2c9b542..a9c5106 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/biopython_convert/JMESPathGen.py b/biopython_convert/JMESPathGen.py index 6f05c4c..2e4284a 100644 --- a/biopython_convert/JMESPathGen.py +++ b/biopython_convert/JMESPathGen.py @@ -29,6 +29,12 @@ # and https://github.com/jmespath/jmespath.py/issues/159 +class Options(jmespath.Options): + def __init__(self, dict_cls=None, custom_functions=None, custom_slice_types=None): + super().__init__(dict_cls, custom_functions) + self.custom_slice_types = custom_slice_types + + def compile(expression): return Parser().parse(expression) @@ -178,6 +184,8 @@ def visit_index(self, node, value, **kwargs): return super().visit_index(node, value) def visit_slice(self, node, value, **kwargs): + if self._options.custom_slice_types is not None and isinstance(value, self._options.custom_slice_types): + return value[slice(*node['children'])] return itertools.islice(value, *node['children']) def visit_multi_select_list(self, node, value, **kwargs): diff --git a/biopython_convert/__init__.py b/biopython_convert/__init__.py index eec2874..2a12886 100644 --- a/biopython_convert/__init__.py +++ b/biopython_convert/__init__.py @@ -27,6 +27,8 @@ stat_annotations = ['molecule_type', 'topology', 'data_file_division', 'date', 'accessions', 'sequence_version', 'gi', 'keywords', 'source', 'organism'] +JMESPathGenOptions = JMESPathGen.Options(custom_functions=JMESPathGen.ExtendedFunctions(), custom_slice_types=(SeqIO.SeqRecord,)) + usage = """\ Use: biopython.convert [-s] [-v] [-i] [-q JMESPath] input_file input_type output_file output_type \t-s Split records into seperate files @@ -192,7 +194,7 @@ def gentype(x): # Wrap input in JMESPath selector if provided if jpath: - input_records = JMESPathGen.search(jpath, gentype(input_records)) + input_records = JMESPathGen.search(jpath, gentype(input_records), JMESPathGenOptions) # Apply xform to both entire return value input_records = xform(input_records) diff --git a/test-data/outputs/jpath_slice b/test-data/outputs/jpath_slice new file mode 100644 index 0000000..7de779a --- /dev/null +++ b/test-data/outputs/jpath_slice @@ -0,0 +1,88 @@ +LOCUS NC_008563 2800 bp DNA UNK 01-JAN-1980 +DEFINITION Escherichia coli APEC O1, complete genome. +ACCESSION NC_008563 +VERSION NC_008563.1 +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers + gene 117..2579 + /locus_tag="APECO1_RS00010" + /old_locus_tag="APECO1_1976" + CDS 117..2579 + /locus_tag="APECO1_RS00010" + /old_locus_tag="APECO1_1976" + /inference="COORDINATES: similar to AA + sequence:RefSeq:WP_005124053.1" + /note="Derived by automated computational analysis using + gene prediction method: Protein Homology." + /codon_start=1 + /transl_table=11 + /product="bifunctional aspartokinase I/homoserine + dehydrogenase I" + /protein_id="WP_001264707.1" + /translation="MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITN + HLVAMIEKTISGQDALPNISDAERIFAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHV + LHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHYLES + TVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADC + CEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCL + IKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMS + RARISVVLITQSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAII + SVVGDGMRTLRGISAKFFAALARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQM + LFNTDQVIEVFVIGVGGVGGALLEQLKRQQSWLKNKHIDLRVCGVANSKALLTNVHGLN + LENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYADFLREGFHVVT + PNKKANTSSMDYYHQLRYAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGI + LSGSLSYIFGKLDEGMSFSEATTLAREMGYTEPDPRDDLSGMDVARKLLILARETGREL + ELADIEIEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVLRYVGNIDEDG + VCRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLR + TLSWKLGV" +ORIGIN + 1 accatcacca ttaccacagg taacggtgcg ggctgacgcg tacaggaaac acagaaaaaa + 61 gcccgcacct gacagtgcgg gctttttttt cgaccaaagg taacgaggta acaaccatgc + 121 gagtgttgaa gttcggcggt acatcagtgg caaatgcaga acgttttctg cgggttgccg + 181 atattctgga aagcaatgcc aggcaggggc aggtggcgac cgtcctctct gcccccgcca + 241 aaattaccaa ccatctggta gcgatgattg aaaaaaccat tagcggccaa gatgctttac + 301 ccaatatcag cgatgccgaa cgtatttttg ccgaacttct gacgggactc gccgccgccc + 361 agccgggatt tccgctggca caattgaaaa ctttcgtcga ccaggaattt gcccaaataa + 421 aacatgtcct gcatggcatt agtttgttgg ggcagtgccc ggatagcatc aacgctgcgc + 481 tgatttgccg tggcgagaaa atgtcgatcg ccattatggc cggcgtgtta gaagcgcgtg + 541 gtcacaacgt taccgttatc gatccggtcg aaaaactgct tgcagtgggg cattacctcg + 601 aatctaccgt tgatattgct gagtccaccc gccgtattgc ggcaagccgc attccggctg + 661 accacatggt gctgatggct ggtttcactg ccggtaatga aaaaggcgag ctggtggttc + 721 tgggacgcaa cggttccgac tactccgctg cggtgctggc ggcctgttta cgcgccgatt + 781 gttgcgagat ctggacggat gttgacggtg tttatacctg cgatccgcgt caggtgcccg + 841 atgcgaggtt gttgaagtcg atgtcctatc aggaagcgat ggagctttct tacttcggcg + 901 ctaaagttct tcacccccgc accatcaccc ccatcgccca gtttcagatc ccttgcctga + 961 ttaaaaatac cggaaatcct caagctccag gtacgctcat tggtgccagc cgtgatgaag + 1021 acgaattacc ggtcaagggc atttccaatc tgaataacat ggcaatgttc agcgtttccg + 1081 gcccggggat gaaagggatg gttggcatgg cggcgcgcgt ctttgcagcg atgtcacgcg + 1141 cccgtatttc cgtggtgctg attacgcaat catcttccga atacagtatc agtttctgcg + 1201 ttccgcaaag cgactgtgtg cgagctgaac gggcaatgca ggaagagttc tacctggaac + 1261 tgaaagaagg cttactggag ccgttggcgg tgacggaacg gctggccatt atctcggtgg + 1321 taggtgatgg tatgcgcacc ttacgtggga tctcggcgaa attctttgcc gcgctggccc + 1381 gcgccaatat caacattgtc gccattgctc agggatcttc tgaacgctca atctctgtcg + 1441 tggtcaataa cgatgatgcg accactggcg tgcgcgttac tcatcagatg ctgttcaata + 1501 ccgatcaggt tatcgaagtg tttgtgattg gcgtcggtgg cgttggcggt gcgctgctgg + 1561 agcaactgaa gcgtcagcaa agctggttga agaataaaca tatcgactta cgtgtctgcg + 1621 gtgttgctaa ctcgaaggca ctgctcacca atgtacatgg ccttaatctg gaaaactggc + 1681 aggaagaact ggcgcaagcc aaagagccgt ttaatctcgg gcgcttaatt cgcctcgtga + 1741 aagaatatca tctgctgaac ccggtcattg ttgactgtac ttccagccag gcagtggcgg + 1801 atcaatatgc cgacttcctg cgcgaaggtt tccacgttgt tacgccgaac aaaaaggcca + 1861 acacctcgtc gatggattac taccatcagt tgcgttatgc ggcggaaaaa tcgcggcgta + 1921 aattcctcta tgacaccaac gttggggctg gattaccggt tatcgagaac ctgcaaaatc + 1981 tgctcaatgc tggtgatgaa ttgatgaagt tctccggcat tctttcaggt tcgctttctt + 2041 atatcttcgg caagttagac gaaggcatga gtttctccga ggcgaccaca ctggcgcggg + 2101 aaatgggtta taccgaaccg gacccgcgag atgatctttc tggtatggat gtggcgcgta + 2161 agctattgat tctcgctcgt gaaacgggac gtgaactgga gctggcggat attgaaattg + 2221 aacctgtgct gcccgcagag tttaacgccg agggtgatgt cgccgctttt atggcgaatc + 2281 tgtcacagct cgacgatctc tttgccgcgc gtgtggcgaa ggcccgtgat gaaggaaaag + 2341 ttttgcgcta tgttggcaat attgatgaag atggcgtctg ccgcgtgaag attgccgaag + 2401 tggatggtaa tgatccgctg ttcaaagtga aaaatggcga aaacgccctg gccttctata + 2461 gccactatta tcagccgctg ccgttggtac tgcgcggata tggtgcgggc aatgacgtta + 2521 cagctgccgg tgtctttgct gatctgctac gtaccctctc atggaagtta ggagtctgac + 2581 atggttaaag tttatgcccc ggcttccagt gccaatatga gcgtcgggtt tgatgtgctc + 2641 ggggcggcgg tgacacctgt tgatggtgca ttgctcggag atgtagtcac ggttgaggcg + 2701 gcagagacat tcagtctcaa caacctcgga cgctttgccg ataagctgcc gtcagagcca + 2761 cgggaaaata tcgtttatca gtgctgggag cgtttttgcc +// diff --git a/tests/test_convert.py b/tests/test_convert.py index 0b9b3cb..ee291d6 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -172,4 +172,12 @@ def test_creation2(self): seq: extract(seq, @), description: desc}) """) - self.compare_files(Path.joinpath(self.output_path, 'ffn'), output_path) \ No newline at end of file + self.compare_files(Path.joinpath(self.output_path, 'ffn'), output_path) + + def test_jpath_slice(self): + """ + Test slicing a SeqRecord + """ + output_path = Path(self.workdir.name, 'jpath_slice') + convert(self.input_path, self.input_type, output_path, 'genbank', jpath='[[0][200:3000]]') + self.compare_files(Path.joinpath(self.output_path, 'jpath_slice'), output_path) \ No newline at end of file