cfe-lab · CBeelen · Jun 16, 2023 · Feb 15, 2023 · Feb 15, 2023 · Feb 15, 2023
diff --git a/Singularity b/Singularity
@@ -13,7 +13,7 @@ From: centos:7
     MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/
     KIVE_INPUTS sample_info_csv contigs_csv conseqs_csv cascade_csv
     KIVE_OUTPUTS outcome_summary_csv conseqs_primers_csv contigs_primers_csv \
-        table_precursor_csv hivseqinr_results_tar
+        table_precursor_csv proviral_landscape_csv hivseqinr_results_tar
     KIVE_THREADS 1
     KIVE_MEMORY 6000
 

diff --git a/gene_splicer/sample.py b/gene_splicer/sample.py
@@ -40,6 +40,9 @@ def parse_args():
     parser.add_argument('table_precursor_csv',
                         help='Sequence data ready to upload',
                         type=FileType('w'))
+    parser.add_argument('proviral_landscape_csv',
+                        help='Data for proviral landscape plot',
+                        type=FileType('w'))
     parser.add_argument('hivseqinr_results_tar',
                         help="Archive file with HIVSeqinR's final results folder.",
                         type=FileType('wb'))
@@ -100,12 +103,14 @@ def main():
     for file in fasta_files:
         gene_splicer.run(file, outdir=outpath)
     utils.generate_table_precursor(name=run_name, outpath=outpath)
+    utils.generate_proviral_landscape_csv(outpath)
     copy_output(outpath / 'outcome_summary.csv', args.outcome_summary_csv)
     copy_output(outpath / (run_name + '_conseqs_primer_analysis.csv'),
                 args.conseqs_primers_csv)
     copy_output(outpath / (run_name + '_contigs_primer_analysis.csv'),
                 args.contigs_primers_csv)
     copy_output(outpath / 'table_precursor.csv', args.table_precursor_csv)
+    copy_output(outpath / 'proviral_landscape.csv', args.proviral_landscape_csv)
 
 
 if __name__ == '__main__':

diff --git a/gene_splicer/utils.py b/gene_splicer/utils.py
@@ -1,3 +1,4 @@
+import csv
 import logging
 import os
 import re
@@ -13,6 +14,9 @@
 
 logger = logging.getLogger(__name__)
 
+LEFT_PRIMER_END = 666
+RIGHT_PRIMER_START = 9604
+
 
 def load_yaml(afile):
     with open(afile) as f:
@@ -493,6 +497,81 @@ def generate_table_precursor_2(hivseqinr_resultsfile, filtered_file,
     return table_precursorfile
 
 
+def generate_proviral_landscape_csv(outpath):
+    proviral_landscape_csv = os.path.join(outpath, 'proviral_landscape.csv')
+    landscape_rows = []
+
+    table_precursor_csv = os.path.join(outpath, 'table_precursor.csv')
+    blastn_csv = glob.glob(
+        os.path.join(
+            outpath,
+            'hivseqinr*',
+            'Results_Intermediate',
+            'Output_Blastn_HXB2MEGA28_tabdelim.txt'
+        )
+    )[0]
+
+    blastn_columns = ['qseqid',
+                      'qlen',
+                      'sseqid',
+                      'sgi',
+                      'slen',
+                      'qstart',
+                      'qend',
+                      'sstart',
+                      'send',
+                      'evalue',
+                      'bitscore',
+                      'length',
+                      'pident',
+                      'nident',
+                      'btop',
+                      'stitle',
+                      'sstrand']
+    with open(blastn_csv, 'r') as blastn_file:
+        blastn_reader = DictReader(blastn_file, fieldnames=blastn_columns, delimiter='\t')
+        for row in blastn_reader:
+            if row['qseqid'] in ['8E5LAV', 'HXB2']:
+                # skip the positive control rows
+                continue
+            ref_start = int(row['sstart'])
+            ref_end = int(row['send'])
+            if ref_end <= LEFT_PRIMER_END or ref_start >= RIGHT_PRIMER_START:
+                # skip unspecific matches of LTR at start and end
+                continue
+            [run_name, sample_name, _, _] = row['qseqid'].split('::')
+            is_inverted = ''
+            if ref_end < ref_start:
+                # automatically recognize inverted regions
+                new_end = ref_start
+                ref_start = ref_end
+                ref_end = new_end
+                is_inverted = 'yes'
+            landscape_entry = {'ref_start': ref_start,
+                               'ref_end': ref_end,
+                               'samp_name': sample_name,
+                               'run_name': run_name,
+                               'is_inverted': is_inverted,
+                               'is_defective': ''}
+            # is_defective is empty for now, will be filled manually
+            landscape_rows.append(landscape_entry)
+
+    with open(table_precursor_csv, 'r') as tab_prec:
+        tab_prec_reader = DictReader(tab_prec)
+        for row in tab_prec_reader:
+            samp_name = row['sample']
+            verdict = row['MyVerdict']
+            for entry in landscape_rows:
+                if entry['samp_name'] == samp_name:
+                    entry['defect'] = verdict
+
+    landscape_columns = ['samp_name', 'run_name', 'ref_start', 'ref_end', 'defect', 'is_inverted', 'is_defective']
+    with open(proviral_landscape_csv, 'w') as landscape_file:
+        landscape_writer = csv.DictWriter(landscape_file, fieldnames=landscape_columns)
+        landscape_writer.writeheader()
+        landscape_writer.writerows(landscape_rows)
+
+
 def get_softclipped_region(query, alignment, alignment_path):
     try:
         size, op = alignment.iloc[0]['cigar'][0]
@@ -672,4 +751,4 @@ def get_samples_from_cascade(cascade_csv: typing.IO,
         'hxb2_end': 9632,
         'direction': 'rev'
     }
-}
+}