Skip to content

Commit

Permalink
Add flanking sequence extraction first try
Browse files Browse the repository at this point in the history
  • Loading branch information
boasvdp committed Feb 18, 2022
1 parent 5ffcc6f commit 8d10a44
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 5 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ jobs:
# Testing for gene cluster
python extract_genes_abricate.py -a test/input/test.tsv -g test/genomes -o test/out_genecluster -s .fasta --genecluster
cmp test/correct/correct_genecluster.fasta test/out_genecluster/input_ncbi.out
# Testing with flanking sequences
python extract_genes_abricate.py -a test/input/flanking.tsv -g test/genomes -o test/out_flanking --flanking --flanking-bp 5
cmp test/correct/correct_flanking.out test/out_flanking/flanking_blaCTX-M-15.out
- name: Tests with combined ABRicate files from multiple genomes
run: |
# Normal gene extraction should work for files with multiple genomes
Expand Down
24 changes: 19 additions & 5 deletions extract_genes_abricate.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ def process_sense(row, genome, output):
logging.debug(f"Calling function process_sense")
START = row['START'] - 1
END = row['END']
writestring = ' '.join([str(row['SEQUENCE']), str(START), str(END)])
START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp)
writestring = ' '.join([str(row['SEQUENCE']), str(START_updated), str(END_updated)])
logging.debug(f"{writestring} is written to NamedTemporaryFile")
with tempfile.NamedTemporaryFile(mode='w+') as tf:
tf.write(writestring)
Expand All @@ -85,7 +86,8 @@ def process_antisense(row, genome, output):
logging.debug(f"Calling function process_antisense")
START = row['START'] - 1
END = row['END']
writestring = ' '.join([str(row['SEQUENCE']), str(START), str(END)])
START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp)
writestring = ' '.join([str(row['SEQUENCE']), str(START_updated), str(END_updated)])
logging.debug(f"{writestring} is written to NamedTemporaryFile")
with tempfile.NamedTemporaryFile(mode='w+') as tf:
tf.write(writestring)
Expand Down Expand Up @@ -133,6 +135,12 @@ def update_record(record, combination):
record.description = ''
return record

def process_flanking(start, end, flanking, flanking_bp):
if flanking:
start = max(0, start - flanking_bp)
end = end + flanking_bp
return start, end

def main_genes(df, args):
logging.debug(f"Calling function main_genes")
combinations_passed = {}
Expand Down Expand Up @@ -174,10 +182,9 @@ def main_genecluster(df, args):
df_most_common_contig = df[df['SEQUENCE'] == most_common_contig]
logging.debug(f"Find gene boundary extremes")
START, END = find_gene_boundary_extremes(df_most_common_contig)

START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp)
logging.debug(f"Construct pandas Series mimicking an object from df.iterrows")
combined_row = pd.Series({'SEQUENCE': most_common_contig, 'START': START, 'END': END})

combined_row = pd.Series({'SEQUENCE': most_common_contig, 'START': START_updated, 'END': END_updated})
logging.debug(f"Check how many hits are sense or antisense")
strand_series = df_most_common_contig['STRAND'].value_counts()
if ('+' in strand_series) and ('-' in strand_series):
Expand Down Expand Up @@ -229,6 +236,9 @@ def main(args):
parser.add_argument("-s", "--suffix", dest="suffix", default=".fasta", help="Genome assembly file suffix (default: .fasta)")
parser.add_argument("--genecluster", dest="genecluster", action="store_true", help="Extract all genes to a single fasta if located on a single contig (default: false)")
parser.add_argument("--csv", dest="csv", action="store_true", help="Use this option if your ABRicate output file is comma-separated (default: parse as tab-separated file).")
parser.add_argument("--flanking", dest="flanking", action="store_true", help="Extract flanking sequences")
parser.add_argument("--flanking-bp", dest="flanking_bp", default=100, metavar="FLANKING LENGTH", type=int, help="Length of flanking sequence to extract in bp (default: 100)")

parser.add_argument("-v", "--verbose", dest="verbose", action="count", help="Increase verbosity")

args = parser.parse_args()
Expand All @@ -241,5 +251,9 @@ def main(args):
else:
logging.basicConfig(level=logging.DEBUG)

# If user has specified --flanking-bp to another value than 100 bp, set flanking to True. Does not work if --flanking-bp 100 is specified
if (args.flanking_bp != 100) & (args.flanking == False):
logging.error(f"--flanking-bp is specified but command does include --flanking flag. Flanking sequences are not extracted.")

logging.debug(f"Executing main function")
main(args)
16 changes: 16 additions & 0 deletions test/correct/correct_flanking.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
>flanking_blaCTX-M-15
GGGGGATGGTTAAAAAATCACTGCGCCAGTTCACGCTGATGGCGACGGCAACCGTCACGC
TGTTGTTAGGAAGTGTGCCGCTGTATGCGCAAACGGCGGACGTACAGCAAAAACTTGCCG
AATTAGAGCGGCAGTCGGGAGGCAGACTGGGTGTGGCATTGATTAACACAGCAGATAATT
CGCAAATACTTTATCGTGCTGATGAGCGCTTTGCGATGTGCAGCACCAGTAAAGTGATGG
CCGCGGCCGCGGTGCTGAAGAAAAGTGAAAGCGAACCGAATCTGTTAAATCAGCGAGTTG
AGATCAAAAAATCTGACCTTGTTAACTATAATCCGATTGCGGAAAAGCACGTCAATGGGA
CGATGTCACTGGCTGAGCTTAGCGCGGCCGCGCTACAGTACAGCGATAACGTGGCGATGA
ATAAGCTGATTGCTCACGTTGGCGGCCCGGCTAGCGTCACCGCGTTCGCCCGACAGCTGG
GAGACGAAACGTTCCGTCTCGACCGTACCGAGCCGACGTTAAACACCGCCATTCCGGGCG
ATCCGCGTGATACCACTTCACCTCGGGCAATGGCGCAAACTCTGCGGAATCTGACGCTGG
GTAAAGCATTGGGCGACAGCCAACGGGCGCAGCTGGTGACATGGATGAAAGGCAATACCA
CCGGTGCAGCGAGCATTCAGGCTGGACTGCCTGCTTCCTGGGTTGTGGGGGATAAAACCG
GCAGCGGTGGCTATGGCACCACCAACGATATCGCGGTGATCTGGCCAAAAGATCGTGCGC
CGCTGATTCTGGTCACTTACTTCACCCAGCCTCAACCTAAGGCAGAAAGCCGTCGCGATG
TATTAGCGTCGGCGGCTAAAATCGTCACCGACGGTTTGTAACCCCC
16 changes: 16 additions & 0 deletions test/genomes/flanking.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
>strainA
GGGGGGGGGGATGGTTAAAAAATCACTGCGCCAGTTCACGCTGATGGCGACGGCAACCGT
CACGCTGTTGTTAGGAAGTGTGCCGCTGTATGCGCAAACGGCGGACGTACAGCAAAAACT
TGCCGAATTAGAGCGGCAGTCGGGAGGCAGACTGGGTGTGGCATTGATTAACACAGCAGA
TAATTCGCAAATACTTTATCGTGCTGATGAGCGCTTTGCGATGTGCAGCACCAGTAAAGT
GATGGCCGCGGCCGCGGTGCTGAAGAAAAGTGAAAGCGAACCGAATCTGTTAAATCAGCG
AGTTGAGATCAAAAAATCTGACCTTGTTAACTATAATCCGATTGCGGAAAAGCACGTCAA
TGGGACGATGTCACTGGCTGAGCTTAGCGCGGCCGCGCTACAGTACAGCGATAACGTGGC
GATGAATAAGCTGATTGCTCACGTTGGCGGCCCGGCTAGCGTCACCGCGTTCGCCCGACA
GCTGGGAGACGAAACGTTCCGTCTCGACCGTACCGAGCCGACGTTAAACACCGCCATTCC
GGGCGATCCGCGTGATACCACTTCACCTCGGGCAATGGCGCAAACTCTGCGGAATCTGAC
GCTGGGTAAAGCATTGGGCGACAGCCAACGGGCGCAGCTGGTGACATGGATGAAAGGCAA
TACCACCGGTGCAGCGAGCATTCAGGCTGGACTGCCTGCTTCCTGGGTTGTGGGGGATAA
AACCGGCAGCGGTGGCTATGGCACCACCAACGATATCGCGGTGATCTGGCCAAAAGATCG
TGCGCCGCTGATTCTGGTCACTTACTTCACCCAGCCTCAACCTAAGGCAGAAAGCCGTCG
CGATGTATTAGCGTCGGCGGCTAAAATCGTCACCGACGGTTTGTAACCCCCCCCCC
2 changes: 2 additions & 0 deletions test/input/flanking.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#FILE SEQUENCE START END STRAND GENE COVERAGE COVERAGE_MAP GAPS %COVERAGE %IDENTITY DATABASE ACCESSION PRODUCT RESISTANCE
test/genomes/flanking.fasta strainA 11 886 + blaCTX-M-15 1-876/876 =============== 0/0 100.00 100.00 ncbi NG_048935.1 class A extended-spectrum beta-lactamase CTX-M-15 CEPHALOSPORIN

0 comments on commit 8d10a44

Please sign in to comment.