From e1da72bb57b2cca06f1fc949570716dedf5f9923 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 15 Feb 2023 12:36:46 -0800 Subject: [PATCH 01/11] WIP: create proviral landscape output, for #16 --- Singularity | 2 +- gene_splicer/sample.py | 5 ++++ gene_splicer/utils.py | 60 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 1 deletion(-) diff --git a/Singularity b/Singularity index 0ecd4df..9f5c9b6 100644 --- a/Singularity +++ b/Singularity @@ -13,7 +13,7 @@ From: centos:7 MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/ KIVE_INPUTS sample_info_csv contigs_csv conseqs_csv cascade_csv KIVE_OUTPUTS outcome_summary_csv conseqs_primers_csv contigs_primers_csv \ - table_precursor_csv hivseqinr_results_tar + table_precursor_csv proviral_landscape_csv hivseqinr_results_tar KIVE_THREADS 1 KIVE_MEMORY 6000 diff --git a/gene_splicer/sample.py b/gene_splicer/sample.py index 32ee20a..e49d684 100644 --- a/gene_splicer/sample.py +++ b/gene_splicer/sample.py @@ -40,6 +40,9 @@ def parse_args(): parser.add_argument('table_precursor_csv', help='Sequence data ready to upload', type=FileType('w')) + parser.add_argument('proviral_landscape_csv', + help='Data for proviral landscape plot', + type=FileType('w')) parser.add_argument('hivseqinr_results_tar', help="Archive file with HIVSeqinR's final results folder.", type=FileType('wb')) @@ -100,12 +103,14 @@ def main(): for file in fasta_files: gene_splicer.run(file, outdir=outpath) utils.generate_table_precursor(name=run_name, outpath=outpath) + utils.generate_proviral_landscape_csv(name=run_name, outpath=outpath) copy_output(outpath / 'outcome_summary.csv', args.outcome_summary_csv) copy_output(outpath / (run_name + '_conseqs_primer_analysis.csv'), args.conseqs_primers_csv) copy_output(outpath / (run_name + '_contigs_primer_analysis.csv'), args.contigs_primers_csv) copy_output(outpath / 'table_precursor.csv', args.table_precursor_csv) + copy_output(outpath / 'proviral_landscape.csv', args.proviral_landscape_csv) if __name__ == '__main__': diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index 4168d45..c014ccd 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -1,3 +1,4 @@ +import csv import logging import os import re @@ -493,6 +494,65 @@ def generate_table_precursor_2(hivseqinr_resultsfile, filtered_file, return table_precursorfile +def generate_proviral_landscape_csv(outpath): + proviral_landscape_csv = os.path.join(outpath, 'proviral_landscape.csv') + landscape_rows = [] + + table_precursor_csv = os.path.join(outpath, 'table_precursor.csv') + blastn_csv = glob.glob( + os.path.join( + outpath, + 'hivseqinr*', + 'Results_Intermediate', + 'Output_Blastn_HXB2MEGA28_tabdelim.txt' + ) + )[0] + + blastn_columns = ['qseqid', + 'qlen', + 'sseqid', + 'sgi', + 'slen', + 'qstart', + 'qend', + 'sstart', + 'send', + 'evalue', + 'bitscore', + 'length', + 'pident', + 'nident', + 'btop', + 'stitle', + 'sstrand'] + with open(blastn_csv, 'r') as blastn_file: + blastn_reader = DictReader(blastn_file, fieldnames=blastn_columns) + for row in blastn_reader: + if row['qseqid'] in ['8E5LAV', 'HXB2']: + # skip the positive control rows + continue + # TODO: have to skip weird entries at start and end + [run_name, sample_name, reference, seqtype] = row['seqid'].split('::', expand=True) + landscape_entry = {'ref_start': row['sstart'], + 'ref_end': row['send'], + 'samp_name': sample_name, + 'samp_id': f"{run_name}::{sample_name}"} + landscape_rows.append(landscape_entry) + + with open(table_precursor_csv, 'r') as tab_prec: + tab_prec_reader = DictReader(tab_prec) + for row in tab_prec_reader: + samp_name = row['sample'] + verdict = row['MyVerdict'] + for entry in landscape_rows: + if entry['samp_name'] == samp_name: + entry['defect'] = verdict + + landscape_columns = ['samp_id', 'ref_start', 'ref_end', 'defect'] + with open(proviral_landscape_csv, 'w') as landscape_file: + landscape_writer = csv.DictWriter(landscape_file, fieldnames=landscape_columns) + + def get_softclipped_region(query, alignment, alignment_path): try: size, op = alignment.iloc[0]['cigar'][0] From 3646119b8481392ea58baf4488dc2801d0581b8e Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 15 Feb 2023 12:39:07 -0800 Subject: [PATCH 02/11] Finish writing csv --- gene_splicer/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index c014ccd..0dca6fe 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -551,6 +551,7 @@ def generate_proviral_landscape_csv(outpath): landscape_columns = ['samp_id', 'ref_start', 'ref_end', 'defect'] with open(proviral_landscape_csv, 'w') as landscape_file: landscape_writer = csv.DictWriter(landscape_file, fieldnames=landscape_columns) + landscape_writer.writerows(landscape_rows) def get_softclipped_region(query, alignment, alignment_path): From e10990d7517d4a3c025b01778b740abe846d4c00 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 15 Feb 2023 13:06:44 -0800 Subject: [PATCH 03/11] Fix mistakes --- gene_splicer/sample.py | 2 +- gene_splicer/utils.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/gene_splicer/sample.py b/gene_splicer/sample.py index e49d684..5e49fb9 100644 --- a/gene_splicer/sample.py +++ b/gene_splicer/sample.py @@ -103,7 +103,7 @@ def main(): for file in fasta_files: gene_splicer.run(file, outdir=outpath) utils.generate_table_precursor(name=run_name, outpath=outpath) - utils.generate_proviral_landscape_csv(name=run_name, outpath=outpath) + utils.generate_proviral_landscape_csv(outpath) copy_output(outpath / 'outcome_summary.csv', args.outcome_summary_csv) copy_output(outpath / (run_name + '_conseqs_primer_analysis.csv'), args.conseqs_primers_csv) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index 0dca6fe..acf4ed9 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -526,13 +526,14 @@ def generate_proviral_landscape_csv(outpath): 'stitle', 'sstrand'] with open(blastn_csv, 'r') as blastn_file: - blastn_reader = DictReader(blastn_file, fieldnames=blastn_columns) + blastn_reader = DictReader(blastn_file, fieldnames=blastn_columns, delimiter='\t') for row in blastn_reader: if row['qseqid'] in ['8E5LAV', 'HXB2']: # skip the positive control rows continue # TODO: have to skip weird entries at start and end - [run_name, sample_name, reference, seqtype] = row['seqid'].split('::', expand=True) + print(row['qseqid']) + [run_name, sample_name, reference, seqtype] = row['qseqid'].split('::') landscape_entry = {'ref_start': row['sstart'], 'ref_end': row['send'], 'samp_name': sample_name, @@ -733,4 +734,4 @@ def get_samples_from_cascade(cascade_csv: typing.IO, 'hxb2_end': 9632, 'direction': 'rev' } -} \ No newline at end of file +} From 9987471b124231a011aa2ac78e2fb3c3e67cf9c5 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 15 Feb 2023 13:08:12 -0800 Subject: [PATCH 04/11] Remove extra entry from dict --- gene_splicer/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index acf4ed9..4def4c4 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -549,6 +549,9 @@ def generate_proviral_landscape_csv(outpath): if entry['samp_name'] == samp_name: entry['defect'] = verdict + for entry in landscape_rows: + del entry['samp_name'] + landscape_columns = ['samp_id', 'ref_start', 'ref_end', 'defect'] with open(proviral_landscape_csv, 'w') as landscape_file: landscape_writer = csv.DictWriter(landscape_file, fieldnames=landscape_columns) From 7a444c0ff1499d90c8fb33be141a07d1ea31d21a Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 15 Feb 2023 16:46:23 -0800 Subject: [PATCH 05/11] Skip LRT matches --- gene_splicer/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index 4def4c4..86a6404 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -531,9 +531,10 @@ def generate_proviral_landscape_csv(outpath): if row['qseqid'] in ['8E5LAV', 'HXB2']: # skip the positive control rows continue - # TODO: have to skip weird entries at start and end - print(row['qseqid']) - [run_name, sample_name, reference, seqtype] = row['qseqid'].split('::') + if row['send'] < 638 or row['sstart'] > 9632: + # skip unspecific matches of LTR at start and end + continue + [run_name, sample_name, _, _] = row['qseqid'].split('::') landscape_entry = {'ref_start': row['sstart'], 'ref_end': row['send'], 'samp_name': sample_name, From b671cae51bd81b7cf8a7dca435b0b622696fab23 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 15 Feb 2023 16:50:16 -0800 Subject: [PATCH 06/11] Add highlighted column --- gene_splicer/utils.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index 86a6404..3021629 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -538,7 +538,9 @@ def generate_proviral_landscape_csv(outpath): landscape_entry = {'ref_start': row['sstart'], 'ref_end': row['send'], 'samp_name': sample_name, - 'samp_id': f"{run_name}::{sample_name}"} + 'run_name': run_name, + 'highlighted': ''} + # highlighted is empty for now: can be filled manually landscape_rows.append(landscape_entry) with open(table_precursor_csv, 'r') as tab_prec: @@ -550,10 +552,7 @@ def generate_proviral_landscape_csv(outpath): if entry['samp_name'] == samp_name: entry['defect'] = verdict - for entry in landscape_rows: - del entry['samp_name'] - - landscape_columns = ['samp_id', 'ref_start', 'ref_end', 'defect'] + landscape_columns = ['samp_id', 'run_name', 'ref_start', 'ref_end', 'defect', 'highlighted'] with open(proviral_landscape_csv, 'w') as landscape_file: landscape_writer = csv.DictWriter(landscape_file, fieldnames=landscape_columns) landscape_writer.writerows(landscape_rows) From 67e7c32238261dc82b0335d3782fff63a896eec1 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 15 Feb 2023 16:50:56 -0800 Subject: [PATCH 07/11] Convert str to int --- gene_splicer/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index 3021629..fda899c 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -531,7 +531,7 @@ def generate_proviral_landscape_csv(outpath): if row['qseqid'] in ['8E5LAV', 'HXB2']: # skip the positive control rows continue - if row['send'] < 638 or row['sstart'] > 9632: + if int(row['send']) < 638 or int(row['sstart']) > 9632: # skip unspecific matches of LTR at start and end continue [run_name, sample_name, _, _] = row['qseqid'].split('::') From f2b06e2dff9d713f09b28037d91260cae810351c Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 15 Feb 2023 16:55:43 -0800 Subject: [PATCH 08/11] Typo in the column name --- gene_splicer/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index fda899c..43a429f 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -552,7 +552,7 @@ def generate_proviral_landscape_csv(outpath): if entry['samp_name'] == samp_name: entry['defect'] = verdict - landscape_columns = ['samp_id', 'run_name', 'ref_start', 'ref_end', 'defect', 'highlighted'] + landscape_columns = ['samp_name', 'run_name', 'ref_start', 'ref_end', 'defect', 'highlighted'] with open(proviral_landscape_csv, 'w') as landscape_file: landscape_writer = csv.DictWriter(landscape_file, fieldnames=landscape_columns) landscape_writer.writerows(landscape_rows) From 20ef00080cf42af98790c10b55668770f3a34958 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Wed, 15 Feb 2023 17:32:54 -0800 Subject: [PATCH 09/11] Add header --- gene_splicer/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index 43a429f..2eb9c66 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -555,6 +555,7 @@ def generate_proviral_landscape_csv(outpath): landscape_columns = ['samp_name', 'run_name', 'ref_start', 'ref_end', 'defect', 'highlighted'] with open(proviral_landscape_csv, 'w') as landscape_file: landscape_writer = csv.DictWriter(landscape_file, fieldnames=landscape_columns) + landscape_writer.writeheader() landscape_writer.writerows(landscape_rows) From d89302ebbff25e3ce71595548daf9f194d8df1a4 Mon Sep 17 00:00:00 2001 From: CBeelen Date: Mon, 6 Mar 2023 11:17:38 -0800 Subject: [PATCH 10/11] Automatically recognize inverted regions --- gene_splicer/utils.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index 2eb9c66..e6a3635 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -14,6 +14,9 @@ logger = logging.getLogger(__name__) +LEFT_PRIMER_END = 666 +RIGHT_PRIMER_START = 9604 + def load_yaml(afile): with open(afile) as f: @@ -531,16 +534,26 @@ def generate_proviral_landscape_csv(outpath): if row['qseqid'] in ['8E5LAV', 'HXB2']: # skip the positive control rows continue - if int(row['send']) < 638 or int(row['sstart']) > 9632: + ref_start = int(row['sstart']) + ref_end = int(row['send']) + if ref_end <= LEFT_PRIMER_END or ref_start >= RIGHT_PRIMER_START: # skip unspecific matches of LTR at start and end continue [run_name, sample_name, _, _] = row['qseqid'].split('::') - landscape_entry = {'ref_start': row['sstart'], - 'ref_end': row['send'], + is_inverted = '' + if ref_end < ref_start: + # automatically recognize inverted regions + new_end = ref_start + ref_start = ref_end + ref_end = new_end + is_inverted = 'yes' + landscape_entry = {'ref_start': ref_start, + 'ref_end': ref_end, 'samp_name': sample_name, 'run_name': run_name, - 'highlighted': ''} - # highlighted is empty for now: can be filled manually + 'is_inverted': is_inverted, + 'is_defective': ''} + # is_defective is empty for now, will be filled manually landscape_rows.append(landscape_entry) with open(table_precursor_csv, 'r') as tab_prec: From bcceeb46d49b067670bb36d1b3ddd34f3e15a89d Mon Sep 17 00:00:00 2001 From: CBeelen Date: Mon, 6 Mar 2023 11:43:43 -0800 Subject: [PATCH 11/11] Fix proviral csv columns --- gene_splicer/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py index e6a3635..b01b5bd 100644 --- a/gene_splicer/utils.py +++ b/gene_splicer/utils.py @@ -565,7 +565,7 @@ def generate_proviral_landscape_csv(outpath): if entry['samp_name'] == samp_name: entry['defect'] = verdict - landscape_columns = ['samp_name', 'run_name', 'ref_start', 'ref_end', 'defect', 'highlighted'] + landscape_columns = ['samp_name', 'run_name', 'ref_start', 'ref_end', 'defect', 'is_inverted', 'is_defective'] with open(proviral_landscape_csv, 'w') as landscape_file: landscape_writer = csv.DictWriter(landscape_file, fieldnames=landscape_columns) landscape_writer.writeheader()