diff --git a/fsac/__init__.py b/fsac/__init__.py index 18df1df..81180e1 100644 --- a/fsac/__init__.py +++ b/fsac/__init__.py @@ -1,3 +1,3 @@ -__version__ = '1.0.4' +__version__ = '1.1.0' __author__ = 'Dillon Barker' __email__ = 'dillon.barker@canada.ca' diff --git a/fsac/allele_call.py b/fsac/allele_call.py index c70ab18..278ac0a 100644 --- a/fsac/allele_call.py +++ b/fsac/allele_call.py @@ -1,4 +1,5 @@ import json +import logging import subprocess import io import re @@ -16,6 +17,8 @@ def allele_call(genome: Path, genes: Path, output: Path) -> None: def blast(query_path: Path, genome_path: Path) -> pd.DataFrame: + logging.info('BLASTing locus %s in subject %s', query_path, genome_path) + out_format = '6 qseqid sseqid pident length qstart qend sstart send qlen \ slen bitscore gaps sseq qseq mismatch' @@ -81,7 +84,6 @@ def filter_result(blast_output: pd.DataFrame) -> Optional[pd.Series]: # Perfect match elif any(blast_output['correct']): - best = blast_output[blast_output['correct']] # Highest bitscore @@ -175,3 +177,5 @@ def unpack_value(value): with json_out.open('w') as j: json.dump(results, j, indent=4) + logging.info('Wrote results to %s', json_out) + diff --git a/fsac/main.py b/fsac/main.py index 4666a9c..dd444ba 100644 --- a/fsac/main.py +++ b/fsac/main.py @@ -1,4 +1,6 @@ import argparse +import logging +import os import sys from pathlib import Path @@ -7,14 +9,23 @@ from .update import update_directory from .tabulate import tabulate_calls +# Ensure numpy, via pandas, doesn't use more than 1 thread. +# If numpy uses multiple threads, it brings no performance benefit in this +# case, but can cause problems if you're running lots of jobs +# on a single HPC node +for env_var in ('OPENBLAS_NUM_THREADS', + 'OMP_NUM_THREADS', + 'MKL_NUM_THREADS', + 'NUMEXPR_NUM_THREADS'): + os.environ[env_var] = '1' def arguments(): parser = argparse.ArgumentParser() parser.add_argument('-v', '--version', - action='store_true', - help='Print version and exit') + action='version', + version=f'{parser.prog} {__version__}') parser.set_defaults(func=None) subparsers = parser.add_subparsers(title='Commands') @@ -91,10 +102,6 @@ def arguments(): args = parser.parse_args() - if args.version: - print('fsac', __version__) - sys.exit(0) - if args.func is None: parser.print_help() sys.exit(0) @@ -106,6 +113,12 @@ def main(): args = arguments() + logging.basicConfig(datefmt='%Y-%m-%d %H:%M', + format='%(asctime)s - %(levelname)s: %(message)s', + stream=sys.stderr, + level=logging.INFO) + + logging.info('Running %s', args.func.__name__) args.func(args) diff --git a/fsac/update.py b/fsac/update.py index 286ed74..86d8bd7 100644 --- a/fsac/update.py +++ b/fsac/update.py @@ -2,6 +2,7 @@ from pathlib import Path from contextlib import suppress import json +import logging SeqAllele = Tuple[Optional[str], Optional[str]] LocusData = Union[str, int, float, bool] @@ -67,20 +68,25 @@ def extend_hit(gene, threshold: int, genome_path: Path): of the alignment causes the alignment to not be extended. """ + logging.info('Extending hit for %s in %s', gene['QueryName'], genome_path) seq = gene['SubjAln'].replace('-', '') difference = gene['QueryLength'] - len(seq) if difference is 0: + logging.info('Difference is 0. Skipping') # handle a complete hit # return early return seq, gene['MarkerMatch'] if difference > threshold: + logging.info('Difference (%s) is greater than %s. Skipping', + difference, threshold) # handle large discrepancy # return early return None, None + # Open subject FASTA sequences_names = get_known_alleles(genome_path) names_sequences = {str(value.split()[0]): key @@ -103,8 +109,16 @@ def extend_hit(gene, threshold: int, genome_path: Path): else: - full_sequence = reverse_complement(contig[start : end]) + try: + full_sequence = reverse_complement(contig[start : end]) + + except KeyError: + msg = 'Cannot assign allele due to ambiguous subject sequence' + logging.info(msg) + return None, None + logging.info('Extended hit to length %s, expected %s', + len(full_sequence) - 1, gene['QueryLength']) return full_sequence, None @@ -115,7 +129,6 @@ def reverse_complement(sequence: str): 'T': 'A', 'G': 'C', 'C': 'G', - 'N': 'N' } return ''.join(complements[x] for x in reversed(sequence))