From f9f84f71443be7a273f64e4def76645ee76ca563 Mon Sep 17 00:00:00 2001 From: donkirkby Date: Fri, 23 Jul 2021 17:16:32 -0700 Subject: [PATCH] Add HIVIntact analysis, for #10. --- Singularity | 19 +++++++++++++++ gene_splicer/primer_finder.py | 37 +++++++++++++++++++++++++---- gene_splicer/primer_finder_class.py | 2 +- gene_splicer/sample.py | 18 +++++++++++--- gene_splicer/study_summary.py | 13 +++++++--- 5 files changed, 78 insertions(+), 11 deletions(-) diff --git a/Singularity b/Singularity index d938b8e..5f18945 100644 --- a/Singularity +++ b/Singularity @@ -28,6 +28,9 @@ From: ubuntu:22.04 fontconfig libbz2-dev liblzma-dev libssl-dev \ libffi-dev libsqlite3-dev + echo ===== Installing MAFFT ===== >/dev/null + apt-get install -y mafft + echo ===== Installing Python ===== >/dev/null apt-get install -y python3 python3-pip @@ -35,6 +38,8 @@ From: ubuntu:22.04 apt-get install -y ncbi-blast+ echo ===== Installing Python packages ===== >/dev/null + pip3 install git+https://github.com/cfe-lab/HIVIntact@cfe-1.2 + cd /opt/primer_finder pip3 install . @@ -55,3 +60,17 @@ From: ubuntu:22.04 %runscript gene_splicer_sample --hivseqinr /opt/hivseqinr "$@" + +%apprun hivintact + gene_splicer_sample --hivintact "$@" + +%apphelp hivintact + Search proviral consensus sequences for primers, then use HIVIntact to + decide if the genomes are complete. + +%applabels hivintact + KIVE_INPUTS sample_info_csv contigs_csv conseqs_csv cascade_csv + KIVE_OUTPUTS outcome_summary_csv conseqs_primers_csv contigs_primers_csv \ + table_precursor_csv hivseqinr_results_tar + KIVE_THREADS 1 + KIVE_MEMORY 200 diff --git a/gene_splicer/primer_finder.py b/gene_splicer/primer_finder.py index dbb741e..62dbdcc 100644 --- a/gene_splicer/primer_finder.py +++ b/gene_splicer/primer_finder.py @@ -1,4 +1,5 @@ import re +import subprocess from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, FileType from csv import DictReader, DictWriter from itertools import groupby @@ -77,6 +78,9 @@ def parse_args(): help="Path to HIVSeqinR source code, or download " "destination. HIVSeqinR will be skipped if this " "isn't given.") + parser.add_argument('--hivintact', + action='store_true', + help="Launch the HIVIntact analysis.") parser.add_argument( '--nodups', action='store_false', @@ -386,8 +390,8 @@ def add_primers(row): def remove_primers(row): # Strip the primers out, convert index values from floats. newseq = row.sequence[ - int(row.fwd_sample_primer_size + row.fwd_sample_primer_start) - :-int(row.rev_sample_primer_size + row.rev_sample_primer_start)] + int(row.fwd_sample_primer_size + row.fwd_sample_primer_start): + -int(row.rev_sample_primer_size + row.rev_sample_primer_start)] row.sequence = newseq return row @@ -422,6 +426,13 @@ def archive_hivseqinr_results(working_path: Path, archive.add(result_path, result_path.name) +def archive_hivintact_results(working_path: Path, + hivintact_results_tar: typing.IO): + archive = TarFile(fileobj=hivintact_results_tar, mode='w') + for result_path in working_path.iterdir(): + archive.add(result_path, result_path.name) + + def run(contigs_csv, conseqs_csv, cascade_csv, @@ -433,7 +444,9 @@ def run(contigs_csv, sample_size=50, force_all_proviral=False, default_sample_name: str = None, - hivseqinr_results_tar: typing.IO = None): + hivseqinr_results_tar: typing.IO = None, + run_hivintact: bool = False, + hivintact_results_tar: typing.IO = None): all_samples = utils.get_samples_from_cascade(cascade_csv, default_sample_name) @@ -517,6 +530,21 @@ def run(contigs_csv, if hivseqinr_results_tar is not None: archive_hivseqinr_results(working_path, hivseqinr_results_tar) + if run_hivintact: + working_path: Path = outpath / f'hivintact_{i}' + working_path.mkdir(exist_ok=True) + with (working_path / 'hiv-intact.log').open('w') as log_file: + subprocess.run(['proviral', + 'intact', + '--subtype=B', + str(no_primers_fasta)], + check=True, + stdout=log_file, + stderr=subprocess.STDOUT, + cwd=working_path) + if hivintact_results_tar is not None: + archive_hivintact_results(working_path, + hivintact_results_tar) files.append(no_primers_fasta) return files @@ -531,7 +559,8 @@ def main(): hivseqinr=args.hivseqinr, nodups=args.nodups, split=args.split, - sample_size=args.sample_size) + sample_size=args.sample_size, + run_hivintact=args.hivintact) return {'fasta_files': fasta_files, 'args': args} diff --git a/gene_splicer/primer_finder_class.py b/gene_splicer/primer_finder_class.py index d5f8cd5..a4f5cf9 100644 --- a/gene_splicer/primer_finder_class.py +++ b/gene_splicer/primer_finder_class.py @@ -144,7 +144,7 @@ def get_slices(self): hxb2_slice = utils.hxb2[self.hxb2_start - self.validation_size:self.hxb2_end] if len(sample_slice) == 0: - logger.debug(\ + logger.debug( 'Sample slice size is 0! \n' f'start: {self.start} \n' f'end: {self.end} \n' diff --git a/gene_splicer/sample.py b/gene_splicer/sample.py index 5e49fb9..858782b 100644 --- a/gene_splicer/sample.py +++ b/gene_splicer/sample.py @@ -44,7 +44,8 @@ def parse_args(): help='Data for proviral landscape plot', type=FileType('w')) parser.add_argument('hivseqinr_results_tar', - help="Archive file with HIVSeqinR's final results folder.", + help="Archive file with HIVSeqinR's final results " + "folder, or HIVIntact's results.", type=FileType('wb')) parser.add_argument( '-p', @@ -57,6 +58,9 @@ def parse_args(): help="Path to HIVSeqinR source code, or download " "destination. HIVSeqinR will be skipped if this " "isn't given.") + parser.add_argument('--hivintact', + action='store_true', + help="Launch the HIVIntact analysis.") parser.add_argument( '--nodups', action='store_false', @@ -88,10 +92,16 @@ def main(): info_reader = DictReader(args.sample_info_csv) sample_info: dict = next(info_reader) run_name = sample_info.get('run_name', 'kive_run') + if args.hivintact: + hivseqinr_results_tar = None + hivintact_results_tar = args.hivseqinr_results_tar + else: + hivseqinr_results_tar = args.hivseqinr_results_tar + hivintact_results_tar = None fasta_files = primer_finder.run(contigs_csv=args.contigs_csv, conseqs_csv=args.conseqs_csv, cascade_csv=args.cascade_csv, - hivseqinr_results_tar=args.hivseqinr_results_tar, + hivseqinr_results_tar=hivseqinr_results_tar, name=run_name, outpath=outpath, hivseqinr=args.hivseqinr, @@ -99,7 +109,9 @@ def main(): split=args.split, sample_size=args.sample_size, force_all_proviral=True, - default_sample_name=sample_info['sample']) + default_sample_name=sample_info['sample'], + run_hivintact=args.hivintact, + hivintact_results_tar=hivintact_results_tar) for file in fasta_files: gene_splicer.run(file, outdir=outpath) utils.generate_table_precursor(name=run_name, outpath=outpath) diff --git a/gene_splicer/study_summary.py b/gene_splicer/study_summary.py index 74a5652..45efe1f 100644 --- a/gene_splicer/study_summary.py +++ b/gene_splicer/study_summary.py @@ -43,6 +43,10 @@ def parse_args(): 'the runs in samples_csv. Any samples not found ' 'in samples.csv will guess the participant id ' 'from the first part of the sample name.') + parser.add_argument('--hivintact', + action='store_true', + help="Launch the HIVIntact analysis instead of " + "HIVSeqinR.") return parser.parse_args() @@ -197,7 +201,7 @@ def write_warnings(self, report_file: typing.TextIO, limit: int = None): file=report_file) -def run_gene_splicer(run_path: Path, outcome_folder: Path): +def run_gene_splicer(run_path: Path, outcome_folder: Path, run_hivintact: bool): version_results_path = run_path / 'Results' / 'version_7.14' assert version_results_path.exists(), version_results_path denovo_path = version_results_path / 'denovo' @@ -214,11 +218,14 @@ def run_gene_splicer(run_path: Path, outcome_folder: Path): pipeline_args = [python_path, '-m', 'gene_splicer.pipeline', '--outpath', str(outcome_folder), - '--hivseqinr', str(hivseqinr_path), contigs_path, conseq_path, cascade_path, short_run_name] + if run_hivintact: + pipeline_args.append('--hivintact') + else: + pipeline_args.append(f'--hivseqinr={hivseqinr_path}') try: with log_path.open('w') as log_file: run(pipeline_args, @@ -259,7 +266,7 @@ def main(): print('Missing denovo results:', run_path) continue else: - run_gene_splicer(run_path, outcome_path.parent) + run_gene_splicer(run_path, outcome_path.parent, args.hivintact) assert outcome_path.exists(), outcome_path print('.', end='', flush=True) dots_printed = True