From 8d10a444660ae13224a98ae17eea23fc0b4c835a Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 18 Feb 2022 12:13:21 +0100 Subject: [PATCH] Add flanking sequence extraction first try --- .github/workflows/main.yml | 3 +++ extract_genes_abricate.py | 24 +++++++++++++++++++----- test/correct/correct_flanking.out | 16 ++++++++++++++++ test/genomes/flanking.fasta | 16 ++++++++++++++++ test/input/flanking.tsv | 2 ++ 5 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 test/correct/correct_flanking.out create mode 100644 test/genomes/flanking.fasta create mode 100644 test/input/flanking.tsv diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4f9a0cf..ece9453 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -50,6 +50,9 @@ jobs: # Testing for gene cluster python extract_genes_abricate.py -a test/input/test.tsv -g test/genomes -o test/out_genecluster -s .fasta --genecluster cmp test/correct/correct_genecluster.fasta test/out_genecluster/input_ncbi.out + # Testing with flanking sequences + python extract_genes_abricate.py -a test/input/flanking.tsv -g test/genomes -o test/out_flanking --flanking --flanking-bp 5 + cmp test/correct/correct_flanking.out test/out_flanking/flanking_blaCTX-M-15.out - name: Tests with combined ABRicate files from multiple genomes run: | # Normal gene extraction should work for files with multiple genomes diff --git a/extract_genes_abricate.py b/extract_genes_abricate.py index 48cf968..7a730c5 100644 --- a/extract_genes_abricate.py +++ b/extract_genes_abricate.py @@ -67,7 +67,8 @@ def process_sense(row, genome, output): logging.debug(f"Calling function process_sense") START = row['START'] - 1 END = row['END'] - writestring = ' '.join([str(row['SEQUENCE']), str(START), str(END)]) + START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp) + writestring = ' '.join([str(row['SEQUENCE']), str(START_updated), str(END_updated)]) logging.debug(f"{writestring} is written to NamedTemporaryFile") with tempfile.NamedTemporaryFile(mode='w+') as tf: tf.write(writestring) @@ -85,7 +86,8 @@ def process_antisense(row, genome, output): logging.debug(f"Calling function process_antisense") START = row['START'] - 1 END = row['END'] - writestring = ' '.join([str(row['SEQUENCE']), str(START), str(END)]) + START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp) + writestring = ' '.join([str(row['SEQUENCE']), str(START_updated), str(END_updated)]) logging.debug(f"{writestring} is written to NamedTemporaryFile") with tempfile.NamedTemporaryFile(mode='w+') as tf: tf.write(writestring) @@ -133,6 +135,12 @@ def update_record(record, combination): record.description = '' return record +def process_flanking(start, end, flanking, flanking_bp): + if flanking: + start = max(0, start - flanking_bp) + end = end + flanking_bp + return start, end + def main_genes(df, args): logging.debug(f"Calling function main_genes") combinations_passed = {} @@ -174,10 +182,9 @@ def main_genecluster(df, args): df_most_common_contig = df[df['SEQUENCE'] == most_common_contig] logging.debug(f"Find gene boundary extremes") START, END = find_gene_boundary_extremes(df_most_common_contig) - + START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp) logging.debug(f"Construct pandas Series mimicking an object from df.iterrows") - combined_row = pd.Series({'SEQUENCE': most_common_contig, 'START': START, 'END': END}) - + combined_row = pd.Series({'SEQUENCE': most_common_contig, 'START': START_updated, 'END': END_updated}) logging.debug(f"Check how many hits are sense or antisense") strand_series = df_most_common_contig['STRAND'].value_counts() if ('+' in strand_series) and ('-' in strand_series): @@ -229,6 +236,9 @@ def main(args): parser.add_argument("-s", "--suffix", dest="suffix", default=".fasta", help="Genome assembly file suffix (default: .fasta)") parser.add_argument("--genecluster", dest="genecluster", action="store_true", help="Extract all genes to a single fasta if located on a single contig (default: false)") parser.add_argument("--csv", dest="csv", action="store_true", help="Use this option if your ABRicate output file is comma-separated (default: parse as tab-separated file).") + parser.add_argument("--flanking", dest="flanking", action="store_true", help="Extract flanking sequences") + parser.add_argument("--flanking-bp", dest="flanking_bp", default=100, metavar="FLANKING LENGTH", type=int, help="Length of flanking sequence to extract in bp (default: 100)") + parser.add_argument("-v", "--verbose", dest="verbose", action="count", help="Increase verbosity") args = parser.parse_args() @@ -241,5 +251,9 @@ def main(args): else: logging.basicConfig(level=logging.DEBUG) + # If user has specified --flanking-bp to another value than 100 bp, set flanking to True. Does not work if --flanking-bp 100 is specified + if (args.flanking_bp != 100) & (args.flanking == False): + logging.error(f"--flanking-bp is specified but command does include --flanking flag. Flanking sequences are not extracted.") + logging.debug(f"Executing main function") main(args) diff --git a/test/correct/correct_flanking.out b/test/correct/correct_flanking.out new file mode 100644 index 0000000..6dd26d0 --- /dev/null +++ b/test/correct/correct_flanking.out @@ -0,0 +1,16 @@ +>flanking_blaCTX-M-15 +GGGGGATGGTTAAAAAATCACTGCGCCAGTTCACGCTGATGGCGACGGCAACCGTCACGC +TGTTGTTAGGAAGTGTGCCGCTGTATGCGCAAACGGCGGACGTACAGCAAAAACTTGCCG +AATTAGAGCGGCAGTCGGGAGGCAGACTGGGTGTGGCATTGATTAACACAGCAGATAATT +CGCAAATACTTTATCGTGCTGATGAGCGCTTTGCGATGTGCAGCACCAGTAAAGTGATGG +CCGCGGCCGCGGTGCTGAAGAAAAGTGAAAGCGAACCGAATCTGTTAAATCAGCGAGTTG +AGATCAAAAAATCTGACCTTGTTAACTATAATCCGATTGCGGAAAAGCACGTCAATGGGA +CGATGTCACTGGCTGAGCTTAGCGCGGCCGCGCTACAGTACAGCGATAACGTGGCGATGA +ATAAGCTGATTGCTCACGTTGGCGGCCCGGCTAGCGTCACCGCGTTCGCCCGACAGCTGG +GAGACGAAACGTTCCGTCTCGACCGTACCGAGCCGACGTTAAACACCGCCATTCCGGGCG +ATCCGCGTGATACCACTTCACCTCGGGCAATGGCGCAAACTCTGCGGAATCTGACGCTGG +GTAAAGCATTGGGCGACAGCCAACGGGCGCAGCTGGTGACATGGATGAAAGGCAATACCA +CCGGTGCAGCGAGCATTCAGGCTGGACTGCCTGCTTCCTGGGTTGTGGGGGATAAAACCG +GCAGCGGTGGCTATGGCACCACCAACGATATCGCGGTGATCTGGCCAAAAGATCGTGCGC +CGCTGATTCTGGTCACTTACTTCACCCAGCCTCAACCTAAGGCAGAAAGCCGTCGCGATG +TATTAGCGTCGGCGGCTAAAATCGTCACCGACGGTTTGTAACCCCC diff --git a/test/genomes/flanking.fasta b/test/genomes/flanking.fasta new file mode 100644 index 0000000..c71212e --- /dev/null +++ b/test/genomes/flanking.fasta @@ -0,0 +1,16 @@ +>strainA +GGGGGGGGGGATGGTTAAAAAATCACTGCGCCAGTTCACGCTGATGGCGACGGCAACCGT +CACGCTGTTGTTAGGAAGTGTGCCGCTGTATGCGCAAACGGCGGACGTACAGCAAAAACT +TGCCGAATTAGAGCGGCAGTCGGGAGGCAGACTGGGTGTGGCATTGATTAACACAGCAGA +TAATTCGCAAATACTTTATCGTGCTGATGAGCGCTTTGCGATGTGCAGCACCAGTAAAGT +GATGGCCGCGGCCGCGGTGCTGAAGAAAAGTGAAAGCGAACCGAATCTGTTAAATCAGCG +AGTTGAGATCAAAAAATCTGACCTTGTTAACTATAATCCGATTGCGGAAAAGCACGTCAA +TGGGACGATGTCACTGGCTGAGCTTAGCGCGGCCGCGCTACAGTACAGCGATAACGTGGC +GATGAATAAGCTGATTGCTCACGTTGGCGGCCCGGCTAGCGTCACCGCGTTCGCCCGACA +GCTGGGAGACGAAACGTTCCGTCTCGACCGTACCGAGCCGACGTTAAACACCGCCATTCC +GGGCGATCCGCGTGATACCACTTCACCTCGGGCAATGGCGCAAACTCTGCGGAATCTGAC +GCTGGGTAAAGCATTGGGCGACAGCCAACGGGCGCAGCTGGTGACATGGATGAAAGGCAA +TACCACCGGTGCAGCGAGCATTCAGGCTGGACTGCCTGCTTCCTGGGTTGTGGGGGATAA +AACCGGCAGCGGTGGCTATGGCACCACCAACGATATCGCGGTGATCTGGCCAAAAGATCG +TGCGCCGCTGATTCTGGTCACTTACTTCACCCAGCCTCAACCTAAGGCAGAAAGCCGTCG +CGATGTATTAGCGTCGGCGGCTAAAATCGTCACCGACGGTTTGTAACCCCCCCCCC diff --git a/test/input/flanking.tsv b/test/input/flanking.tsv new file mode 100644 index 0000000..e4b7655 --- /dev/null +++ b/test/input/flanking.tsv @@ -0,0 +1,2 @@ +#FILE SEQUENCE START END STRAND GENE COVERAGE COVERAGE_MAP GAPS %COVERAGE %IDENTITY DATABASE ACCESSION PRODUCT RESISTANCE +test/genomes/flanking.fasta strainA 11 886 + blaCTX-M-15 1-876/876 =============== 0/0 100.00 100.00 ncbi NG_048935.1 class A extended-spectrum beta-lactamase CTX-M-15 CEPHALOSPORIN