diff --git a/extract_genes_abricate.py b/extract_genes_abricate.py index 5b19467..136b1f4 100644 --- a/extract_genes_abricate.py +++ b/extract_genes_abricate.py @@ -11,252 +11,392 @@ import tempfile def check_outdir(outdir): - logging.debug(f"Calling function check_outdir") - logging.debug(f"Checking whether {outdir} exists") - if not os.path.exists(outdir): - logging.debug(f"{outdir} does not exist, making this now") - os.makedirs(outdir) + """ + Ensure the existence of the output directory. + + Args: + outdir (str): The path to the output directory. + + Returns: + None + """ + logging.debug(f"Calling function check_outdir") + logging.debug(f"Checking whether {outdir} exists") + if not os.path.exists(outdir): + logging.debug(f"{outdir} does not exist, making this now") + os.makedirs(outdir) def read_abricatefile(abricatefile, csv): - logging.debug(f"Calling function read_abricatefile") - logging.debug(f"Checking whether ABRicate file should be read as a csv file: {csv}") - if csv == True: - logging.debug(f"Reading {abricatefile} as a csv file") - df = pd.read_csv(abricatefile, sep = ',') - else: - logging.debug(f"Reading {abricatefile} as a tsv file") - df = pd.read_csv(abricatefile, sep = '\t') - return df + """ + Read the ABRicate file and return its contents as a DataFrame. + + Args: + abricatefile (str): The path to the ABRicate file. + csv (bool): True if the ABRicate file is in CSV format, False for TSV. + + Returns: + pd.DataFrame: The ABRicate data as a DataFrame. + """ + logging.debug(f"Calling function read_abricatefile") + logging.debug(f"Checking whether ABRicate file should be read as a csv file: {csv}") + if csv == True: + logging.debug(f"Reading {abricatefile} as a csv file") + df = pd.read_csv(abricatefile, sep = ',') + else: + logging.debug(f"Reading {abricatefile} as a tsv file") + df = pd.read_csv(abricatefile, sep = '\t') + return df def check_combination(combination, combinations_passed): - logging.debug(f"Calling function check_combination") - found_duplicate = False - logging.debug(f"Checking whether {combination} is in {combinations_passed}") - if combination not in combinations_passed: - logging.debug(f"{combination} was not found. Is added to checked combinations") - combinations_passed[combination] = 1 - checked_combination = combination - else: - logging.debug(f"{combination} was found in checked combinations. Marked as duplicate") - combinations_passed[combination] += 1 - combination_count = combinations_passed[combination] - logging.debug(f"Joining {combination} with {combination_count}") - checked_combination = '_'.join([combination, str(combination_count)]) - found_duplicate = True - return checked_combination, combinations_passed, found_duplicate + """ + Check if a combination exists in the passed combinations, and handle duplicates. + + Args: + combination (str): The combination to check. + combinations_passed (dict): A dictionary of combinations already processed. + + Returns: + Tuple[str, dict, bool]: A tuple containing the checked combination, + updated combinations dictionary, and a flag indicating if it's a duplicate. + """ + logging.debug(f"Calling function check_combination") + found_duplicate = False + logging.debug(f"Checking whether {combination} is in {combinations_passed}") + if combination not in combinations_passed: + logging.debug(f"{combination} was not found. Is added to checked combinations") + combinations_passed[combination] = 1 + checked_combination = combination + else: + logging.debug(f"{combination} was found in checked combinations. Marked as duplicate") + combinations_passed[combination] += 1 + combination_count = combinations_passed[combination] + logging.debug(f"Joining {combination} with {combination_count}") + checked_combination = '_'.join([combination, str(combination_count)]) + found_duplicate = True + return checked_combination, combinations_passed, found_duplicate def parse_row(row, combinations_passed, suffix, genomedir): - logging.debug(f"Calling function parse_row") - logging.debug(f"Constructing strain name") - strain = str(os.path.basename(row['#FILE'])).replace(suffix, '') - logging.debug(f"Genome name is constructed based on {genomedir} and {strain} and {suffix}") - genome = genomedir + '/' + strain + suffix - combination = strain + '_' + row['GENE'] - logging.debug(f"Filtering {combination} using regex") - combination = re.sub('[^\w\-_\. ]', '_', combination) - logging.debug(f"Name is now {combination}") - logging.debug(f"{combination} is checked against previously constructed combinations") - checked_combination, combinations_passed, found_duplicate = check_combination(combination, combinations_passed) - if found_duplicate: - logging.warning(f"{combination} is found more than once in {strain}. Writing output sequence to {checked_combination}") - logging.debug(f"Constructing output file name for {checked_combination}") - output = args.outdir + '/' + checked_combination + '.out' - return genome, checked_combination, output + """ + Parse a row from the ABRicate file and construct output file information. + + Args: + row (pd.Series): The row to be processed. + combinations_passed (dict): A dictionary of combinations already processed. + suffix (str): The suffix used for constructing genome names. + genomedir (str): The directory containing genome files. + + Returns: + Tuple[str, str, str]: A tuple containing the genome name, checked combination, + and output file path. + """ + logging.debug(f"Calling function parse_row") + logging.debug(f"Constructing strain name") + strain = str(os.path.basename(row['#FILE'])).replace(suffix, '') + logging.debug(f"Genome name is constructed based on {genomedir} and {strain} and {suffix}") + genome = genomedir + '/' + strain + suffix + combination = strain + '_' + row['GENE'] + logging.debug(f"Filtering {combination} using regex") + combination = re.sub('[^\w\-_\. ]', '_', combination) + logging.debug(f"Name is now {combination}") + logging.debug(f"{combination} is checked against previously constructed combinations") + checked_combination, combinations_passed, found_duplicate = check_combination(combination, combinations_passed) + if found_duplicate: + logging.warning(f"{combination} is found more than once in {strain}. Writing output sequence to {checked_combination}") + logging.debug(f"Constructing output file name for {checked_combination}") + output = args.outdir + '/' + checked_combination + '.out' + return genome, checked_combination, output def process_sense(row, genome, output): - logging.debug(f"Calling function process_sense") - START = row['START'] - 1 - END = row['END'] - START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp) - writestring = ' '.join([str(row['SEQUENCE']), str(START_updated), str(END_updated)]) - logging.debug(f"{writestring} is written to NamedTemporaryFile") - with tempfile.NamedTemporaryFile(mode='w+') as tf: - tf.write(writestring) - tf.seek(0) - logging.debug(f"seqtk is called using subprocess for {genome}") - with open(output, 'w+') as out: - subprocess.call(["seqtk", "subseq", genome, tf.name], stdout=out) - - logging.debug(f"{output} is read using SeqIO") - record = SeqIO.read(output, "fasta") - - return record + """ + Process a row with a gene in sense orientation and write the sequence to an output file. + + Args: + row (pd.Series): The row representing the gene. + genome (str): The path to the genome file. + output (str): The path to the output file. + + Returns: + SeqRecord: The processed sequence record. + """ + logging.debug(f"Calling function process_sense") + START = row['START'] - 1 + END = row['END'] + START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp) + writestring = ' '.join([str(row['SEQUENCE']), str(START_updated), str(END_updated)]) + logging.debug(f"{writestring} is written to NamedTemporaryFile") + with tempfile.NamedTemporaryFile(mode='w+') as tf: + tf.write(writestring) + tf.seek(0) + logging.debug(f"seqtk is called using subprocess for {genome}") + with open(output, 'w+') as out: + subprocess.call(["seqtk", "subseq", genome, tf.name], stdout=out) + + logging.debug(f"{output} is read using SeqIO") + record = SeqIO.read(output, "fasta") + + return record def process_antisense(row, genome, output): - logging.debug(f"Calling function process_antisense") - START = row['START'] - 1 - END = row['END'] - START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp) - writestring = ' '.join([str(row['SEQUENCE']), str(START_updated), str(END_updated)]) - logging.debug(f"{writestring} is written to NamedTemporaryFile") - with tempfile.NamedTemporaryFile(mode='w+') as tf: - tf.write(writestring) - tf.seek(0) - logging.debug(f"seqtk is called using subprocess for {genome}") - with tempfile.NamedTemporaryFile(mode='w+') as rev: - subprocess.call(["seqtk", "subseq", genome, tf.name], stdout = rev) - rev.seek(0) - logging.debug(f"NamedTemporaryFile is read using SeqIO and reverse complemented") - record = SeqIO.read(rev.name, "fasta").reverse_complement() - - return record + """ + Process a row with a gene in antisense orientation, reverse complement it, and write to an output file. + + Args: + row (pd.Series): The row representing the gene. + genome (str): The path to the genome file. + output (str): The path to the output file. + + Returns: + SeqRecord: The processed sequence record. + """ + logging.debug(f"Calling function process_antisense") + START = row['START'] - 1 + END = row['END'] + START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp) + writestring = ' '.join([str(row['SEQUENCE']), str(START_updated), str(END_updated)]) + logging.debug(f"{writestring} is written to NamedTemporaryFile") + with tempfile.NamedTemporaryFile(mode='w+') as tf: + tf.write(writestring) + tf.seek(0) + logging.debug(f"seqtk is called using subprocess for {genome}") + with tempfile.NamedTemporaryFile(mode='w+') as rev: + subprocess.call(["seqtk", "subseq", genome, tf.name], stdout = rev) + rev.seek(0) + logging.debug(f"NamedTemporaryFile is read using SeqIO and reverse complemented") + record = SeqIO.read(rev.name, "fasta").reverse_complement() + + return record def parse_multiple_rows(df, suffix, genomedir): - logging.debug(f"Calling function parse_multiple_rows") - logging.debug(f"Assert whether only one FILE is found") - original_file_name = df['#FILE'].unique() - assert len(original_file_name) == 1 - logging.debug(f"Constructing strain name based on {original_file_name}") - strain = str(os.path.basename(original_file_name[0]).replace(suffix, '')) - logging.debug(f"Constructing genome name based on {genomedir} and {strain} and {suffix}") - genome = genomedir + '/' + strain + suffix - logging.debug(f"Asser whether only one DATABASE is found") - original_db = df['DATABASE'].unique() - assert len(original_db) == 1 - combination = strain + '_' + original_db[0] - logging.debug(f"Filtering {combination} using regex") - combination = re.sub('[^\w\-_\. ]', '_', combination) - logging.debug(f"Name is now {combination}") - logging.debug(f"Constructing output file name for {combination}") - output = args.outdir + '/' + combination + '.out' - return genome, combination, output + """ + Parse multiple rows at once and construct output file information for gene clusters. + + Args: + df (pd.DataFrame): The DataFrame containing gene cluster information. + suffix (str): The suffix used for constructing genome names. + genomedir (str): The directory containing genome files. + + Returns: + Tuple[str, str, str]: A tuple containing the genome name, combination, and output file path. + """ + logging.debug(f"Calling function parse_multiple_rows") + logging.debug(f"Assert whether only one FILE is found") + original_file_name = df['#FILE'].unique() + assert len(original_file_name) == 1 + logging.debug(f"Constructing strain name based on {original_file_name}") + strain = str(os.path.basename(original_file_name[0]).replace(suffix, '')) + logging.debug(f"Constructing genome name based on {genomedir} and {strain} and {suffix}") + genome = genomedir + '/' + strain + suffix + logging.debug(f"Asser whether only one DATABASE is found") + original_db = df['DATABASE'].unique() + assert len(original_db) == 1 + combination = strain + '_' + original_db[0] + logging.debug(f"Filtering {combination} using regex") + combination = re.sub('[^\w\-_\. ]', '_', combination) + logging.debug(f"Name is now {combination}") + logging.debug(f"Constructing output file name for {combination}") + output = args.outdir + '/' + combination + '.out' + return genome, combination, output def find_gene_boundary_extremes(df): - logging.debug(f"Calling function find_gene_boundary_extremes") - logging.debug(f"Finding gene boundaries based on whole file") - list_extreme_gene_boundaries = [df['START'].min(), df['START'].max(), df['END'].min(), df['END'].max()] - return min(list_extreme_gene_boundaries), max(list_extreme_gene_boundaries) + """ + Find the extreme gene boundaries in a DataFrame. + + Args: + df (pd.DataFrame): The DataFrame containing gene information. + + Returns: + Tuple[int, int]: A tuple containing the minimum and maximum gene boundaries. + """ + logging.debug(f"Calling function find_gene_boundary_extremes") + logging.debug(f"Finding gene boundaries based on whole file") + list_extreme_gene_boundaries = [df['START'].min(), df['START'].max(), df['END'].min(), df['END'].max()] + return min(list_extreme_gene_boundaries), max(list_extreme_gene_boundaries) def update_record(record, combination): - logging.debug(f"Calling function update_record") - logging.debug(f"Updating record ID using {combination}") - record.id = combination - logging.debug(f"Removing description from record") - record.description = '' - return record + """ + Update the ID and remove description of a SeqRecord. + + Args: + record (SeqRecord): The sequence record to be updated. + combination (str): The combination to set as the new ID. + + Returns: + SeqRecord: The updated sequence record. + """ + logging.debug(f"Calling function update_record") + logging.debug(f"Updating record ID using {combination}") + record.id = combination + logging.debug(f"Removing description from record") + record.description = '' + return record def process_flanking(start, end, flanking, flanking_bp): - if flanking: - start = max(0, start - flanking_bp) - end = end + flanking_bp - return start, end + """ + Adjust gene boundaries to include flanking sequences if necessary. + + Args: + start (int): The start position of the gene. + end (int): The end position of the gene. + flanking (bool): Whether flanking sequences should be extracted. + flanking_bp (int): The length of flanking sequence to extract in base pairs. + + Returns: + Tuple[int, int]: A tuple containing the updated start and end positions. + """ + if flanking: + start = max(0, start - flanking_bp) + end = end + flanking_bp + return start, end def main_genes(df, args): - logging.debug(f"Calling function main_genes") - combinations_passed = {} - logging.debug(f"Looping through df") - for index, row in df.iterrows(): - logging.debug(f"Parsing row {index}") - genome, checked_combination, output = parse_row(row, combinations_passed, args.suffix, args.genomedir) + """ + Main function for extracting individual genes from ABRicate output. + + Args: + df (pd.DataFrame): The DataFrame containing ABRicate output. + args (argparse.Namespace): The command-line arguments. + + Returns: + None + """ + logging.debug(f"Calling function main_genes") + combinations_passed = {} + logging.debug(f"Looping through df") + for index, row in df.iterrows(): + logging.debug(f"Parsing row {index}") + genome, checked_combination, output = parse_row(row, combinations_passed, args.suffix, args.genomedir) + logging.debug(f"Assert that STRAND column is found in abricatefile") + assert 'STRAND' in row, "STRAND column is not found in ABRicate file, was ABRicate version 0.9.8 or later?" + if row['STRAND'] != '-': + logging.debug(f"Processing row with gene in sense") + record = process_sense(row, genome, output) + else: + logging.debug(f"Processing row with gene in antisense") + record = process_antisense(row, genome, output) + logging.debug(f"Updating record with {checked_combination}") + updated_record = update_record(record, checked_combination) + logging.debug(f"Writing updated record to {output} as fasta file") + SeqIO.write(updated_record, output, "fasta") + +def main_genecluster(df, args): + """ + Main function for extracting gene clusters from ABRicate output. + + Args: + df (pd.DataFrame): The DataFrame containing ABRicate output. + args (argparse.Namespace): The command-line arguments. + + Returns: + None + """ + # This function needs refactoring + logging.debug(f"Calling function main_genecluster") + logging.debug(f"Parse multiple rows at once for gene cluster processing") + genome, combination, output = parse_multiple_rows(df, args.suffix, args.genomedir) logging.debug(f"Assert that STRAND column is found in abricatefile") - assert 'STRAND' in row, "STRAND column is not found in ABRicate file, was ABRicate version 0.9.8 or later?" - if row['STRAND'] != '-': - logging.debug(f"Processing row with gene in sense") - record = process_sense(row, genome, output) + assert 'STRAND' in df, "STRAND column is not found in ABRicate file, was ABRicate version 0.9.8 or later?" + logging.debug(f"Checking the number of contigs on which hits are found") + value_count_contigs = df['SEQUENCE'].value_counts() + total_nr_contigs = len(value_count_contigs) + total_nr_hits = value_count_contigs.sum() + most_common_contig = value_count_contigs.index[0] + logging.debug(f"Total number of contigs in file are checked") + if total_nr_contigs > 1: + logging.warning(f"Hits were found on {total_nr_contigs}. Contig {most_common_contig} has most hits and will be selected") + if (value_count_contigs[0] / total_nr_hits) < 0.5: + logging.warning(f"Contig {most_common_contig} has most hits, but contains less than half of all hits") + elif value_count_contigs[0] == value_count_contigs[1]: + logging.warning(f"The same number of hits were found on (at least) the top two contigs. Top contig is selected based on value_counts() sorting") + df_most_common_contig = df[df['SEQUENCE'] == most_common_contig] + logging.debug(f"Find gene boundary extremes") + START, END = find_gene_boundary_extremes(df_most_common_contig) + START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp) + logging.debug(f"Construct pandas Series mimicking an object from df.iterrows") + combined_row = pd.Series({'SEQUENCE': most_common_contig, 'START': START_updated, 'END': END_updated}) + logging.debug(f"Check how many hits are sense or antisense") + strand_series = df_most_common_contig['STRAND'].value_counts() + # Check if all genes are in same orientation + if ('+' in strand_series) and ('-' in strand_series): + logging.debug(f"Sense and antisense hits in abricate output") + # If there are more antisense than sense hits, reverse complement the sequence before writing to file + # If a result is written the majority of genes should be sense by then + if strand_series.loc['-'] > strand_series.loc['+']: + logging.debug(f"More antisense hits are found") + decision_strand = 'antisense' + else: + logging.debug(f"More sense hits are found") + decision_strand = 'sense' + elif '-' not in strand_series: + logging.debug(f"No antisense hits in file: processing as sense") + decision_strand = 'sense' + elif '+' not in strand_series: + logging.debug(f"No sense hits in file: processing as antisense") + decision_strand = 'antisense' + + logging.debug(f"decision_strand has value {decision_strand}") + if decision_strand == 'sense': + logging.debug(f"Processing combined_row with gene cluster in sense") + record = process_sense(combined_row, genome, output) + elif decision_strand == 'antisense': + logging.debug(f"Processing combined_row with gene cluster in antisense") + record = process_antisense(combined_row, genome, output) else: - logging.debug(f"Processing row with gene in antisense") - record = process_antisense(row, genome, output) - logging.debug(f"Updating record with {checked_combination}") - updated_record = update_record(record, checked_combination) + logging.critical(f"decision_strand is {decision_strand}. This is an invalid value") + logging.debug(f"Updating record with {combination}") + updated_record = update_record(record, combination) logging.debug(f"Writing updated record to {output} as fasta file") SeqIO.write(updated_record, output, "fasta") -def main_genecluster(df, args): - logging.debug(f"Calling function main_genecluster") - logging.debug(f"Parse multiple rows at once for gene cluster processing") - genome, combination, output = parse_multiple_rows(df, args.suffix, args.genomedir) - logging.debug(f"Assert that STRAND column is found in abricatefile") - assert 'STRAND' in df, "STRAND column is not found in ABRicate file, was ABRicate version 0.9.8 or later?" - logging.debug(f"Checking the number of contigs on which hits are found") - value_count_contigs = df['SEQUENCE'].value_counts() - total_nr_contigs = len(value_count_contigs) - total_nr_hits = value_count_contigs.sum() - most_common_contig = value_count_contigs.index[0] - logging.debug(f"Total number of contigs in file are checked") - if total_nr_contigs > 1: - logging.warning(f"Hits were found on {total_nr_contigs}. Contig {most_common_contig} has most hits and will be selected") - if (value_count_contigs[0] / total_nr_hits) < 0.5: - logging.warning(f"Contig {most_common_contig} has most hits, but contains less than half of all hits") - elif value_count_contigs[0] == value_count_contigs[1]: - logging.warning(f"The same number of hits were found on (at least) the top two contigs. Top contig is selected based on value_counts() sorting") - df_most_common_contig = df[df['SEQUENCE'] == most_common_contig] - logging.debug(f"Find gene boundary extremes") - START, END = find_gene_boundary_extremes(df_most_common_contig) - START_updated, END_updated = process_flanking(START, END, args.flanking, args.flanking_bp) - logging.debug(f"Construct pandas Series mimicking an object from df.iterrows") - combined_row = pd.Series({'SEQUENCE': most_common_contig, 'START': START_updated, 'END': END_updated}) - logging.debug(f"Check how many hits are sense or antisense") - strand_series = df_most_common_contig['STRAND'].value_counts() - if ('+' in strand_series) and ('-' in strand_series): - logging.debug(f"Sense and antisense hits in abricate output") - if strand_series.loc['-'] > strand_series.loc['+']: - logging.debug(f"More antisense hits are found") - decision_strand = 'antisense' - else: - logging.debug(f"More sense hits are found") - decision_strand = 'sense' - elif '-' not in strand_series: - logging.debug(f"No antisense hits in file: processing as sense") - decision_strand = 'sense' - elif '+' not in strand_series: - logging.debug(f"No sense hits in file: processing as antisense") - decision_strand = 'antisense' - - logging.debug(f"decision_strand has value {decision_strand}") - if decision_strand == 'sense': - logging.debug(f"Processing combined_row with gene cluster in sense") - record = process_sense(combined_row, genome, output) - elif decision_strand == 'antisense': - logging.debug(f"Processing combined_row with gene cluster in antisense") - record = process_antisense(combined_row, genome, output) - else: - logging.critical(f"decision_strand is {decision_strand}. This is an invalid value") - logging.debug(f"Updating record with {combination}") - updated_record = update_record(record, combination) - logging.debug(f"Writing updated record to {output} as fasta file") - SeqIO.write(updated_record, output, "fasta") - def main(args): - logging.debug(f"Calling function main") - check_outdir(args.outdir) - df = read_abricatefile(args.abricatefile, args.csv) - if df.shape[0] > 0: - if args.genecluster == False: - logging.debug(f"Executing main_genes for extraction of single genes") - main_genes(df, args) + """ + The main function for processing ABRicate output and extracting genes or gene clusters. + + Args: + args (argparse.Namespace): The command-line arguments. + + Returns: + None + """ + logging.debug(f"Calling function main") + check_outdir(args.outdir) + df = read_abricatefile(args.abricatefile, args.csv) + if df.shape[0] > 0: + if args.genecluster == False: + logging.debug(f"Executing main_genes for extraction of single genes") + main_genes(df, args) + else: + logging.debug(f"Executing main_genecluster for extraction of gene clusters") + main_genecluster(df, args) else: - logging.debug(f"Executing main_genecluster for extraction of gene clusters") - main_genecluster(df, args) - else: - logging.warning(f"No hits found in {args.abricatefile}, outputting empty directory {args.outdir}") + logging.warning(f"No hits found in {args.abricatefile}, outputting empty directory {args.outdir}") if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Extract genes from genes based on ABRicate output.') - - parser.add_argument("-a", "--abricatefile", dest="abricatefile", help="ABRicate file to parse genes", metavar="ABRICATE FILE", required=True, type=str) - parser.add_argument("-g", "--genomedir", dest="genomedir", help="directory containing genomes", metavar="GENOMES DIR", required=True, type=str) - parser.add_argument("-o", "--output", dest="outdir", help="directory for output", metavar="OUTPUT DIR", required=True, type=str) - parser.add_argument("-s", "--suffix", dest="suffix", default=".fasta", help="Genome assembly file suffix (default: .fasta)") - parser.add_argument("--genecluster", dest="genecluster", action="store_true", help="Extract all genes to a single fasta if located on a single contig (default: false)") - parser.add_argument("--csv", dest="csv", action="store_true", help="Use this option if your ABRicate output file is comma-separated (default: parse as tab-separated file).") - parser.add_argument("--flanking", dest="flanking", action="store_true", help="Extract flanking sequences") - parser.add_argument("--flanking-bp", dest="flanking_bp", default=100, metavar="FLANKING LENGTH", type=int, help="Length of flanking sequence to extract in bp (default: 100)") - - parser.add_argument("-v", "--verbose", dest="verbose", action="count", help="Increase verbosity") - - args = parser.parse_args() + parser = argparse.ArgumentParser(description='Extract genes from genes based on ABRicate output.') + + parser.add_argument("-a", "--abricatefile", dest="abricatefile", help="ABRicate file to parse genes", metavar="ABRICATE FILE", required=True, type=str) + parser.add_argument("-g", "--genomedir", dest="genomedir", help="directory containing genomes", metavar="GENOMES DIR", required=True, type=str) + parser.add_argument("-o", "--output", dest="outdir", help="directory for output", metavar="OUTPUT DIR", required=True, type=str) + parser.add_argument("-s", "--suffix", dest="suffix", default=".fasta", help="Genome assembly file suffix (default: .fasta)") + parser.add_argument("--genecluster", dest="genecluster", action="store_true", help="Extract all genes to a single fasta if located on a single contig (default: false)") + parser.add_argument("--csv", dest="csv", action="store_true", help="Use this option if your ABRicate output file is comma-separated (default: parse as tab-separated file).") + parser.add_argument("--flanking", dest="flanking", action="store_true", help="Extract flanking sequences") + parser.add_argument("--flanking-bp", dest="flanking_bp", default=100, metavar="FLANKING LENGTH", type=int, help="Length of flanking sequence to extract in bp (default: 100)") + + parser.add_argument("-v", "--verbose", dest="verbose", action="count", help="Increase verbosity") + + args = parser.parse_args() - if not args.verbose: - logging.basicConfig(level=logging.WARNING) - elif args.verbose == 1: - ## This can be set to INFO level if more levels are needed - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.DEBUG) - - # If user has specified --flanking-bp to another value than 100 bp, set flanking to True. Does not work if --flanking-bp 100 is specified - if (args.flanking_bp != 100) & (args.flanking == False): - logging.error(f"--flanking-bp is specified but command does include --flanking flag. Flanking sequences are not extracted.") - - logging.debug(f"Executing main function") - main(args) + if not args.verbose: + logging.basicConfig(level=logging.WARNING) + elif args.verbose == 1: + ## This can be set to INFO level if more levels are needed + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.DEBUG) + + # If user has specified --flanking-bp to another value than 100 bp, set flanking to True. Does not work if --flanking-bp 100 is specified + if (args.flanking_bp != 100) & (args.flanking == False): + logging.error(f"--flanking-bp is specified but command does include --flanking flag. Flanking sequences are not extracted.") + + logging.debug(f"Executing main function") + main(args)