Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Proviral landscape plots #22

Merged
merged 12 commits into from
Jun 16, 2023
2 changes: 1 addition & 1 deletion Singularity
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ From: centos:7
MAINTAINER BC CfE in HIV/AIDS https://github.com/cfe-lab/
KIVE_INPUTS sample_info_csv contigs_csv conseqs_csv cascade_csv
KIVE_OUTPUTS outcome_summary_csv conseqs_primers_csv contigs_primers_csv \
table_precursor_csv hivseqinr_results_tar
table_precursor_csv proviral_landscape_csv hivseqinr_results_tar
KIVE_THREADS 1
KIVE_MEMORY 6000

Expand Down
5 changes: 5 additions & 0 deletions gene_splicer/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ def parse_args():
parser.add_argument('table_precursor_csv',
help='Sequence data ready to upload',
type=FileType('w'))
parser.add_argument('proviral_landscape_csv',
help='Data for proviral landscape plot',
type=FileType('w'))
parser.add_argument('hivseqinr_results_tar',
help="Archive file with HIVSeqinR's final results folder.",
type=FileType('wb'))
Expand Down Expand Up @@ -100,12 +103,14 @@ def main():
for file in fasta_files:
gene_splicer.run(file, outdir=outpath)
utils.generate_table_precursor(name=run_name, outpath=outpath)
utils.generate_proviral_landscape_csv(outpath)
copy_output(outpath / 'outcome_summary.csv', args.outcome_summary_csv)
copy_output(outpath / (run_name + '_conseqs_primer_analysis.csv'),
args.conseqs_primers_csv)
copy_output(outpath / (run_name + '_contigs_primer_analysis.csv'),
args.contigs_primers_csv)
copy_output(outpath / 'table_precursor.csv', args.table_precursor_csv)
copy_output(outpath / 'proviral_landscape.csv', args.proviral_landscape_csv)


if __name__ == '__main__':
Expand Down
81 changes: 80 additions & 1 deletion gene_splicer/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
import logging
import os
import re
Expand All @@ -13,6 +14,9 @@

logger = logging.getLogger(__name__)

LEFT_PRIMER_END = 666
RIGHT_PRIMER_START = 9604


def load_yaml(afile):
with open(afile) as f:
Expand Down Expand Up @@ -493,6 +497,81 @@ def generate_table_precursor_2(hivseqinr_resultsfile, filtered_file,
return table_precursorfile


def generate_proviral_landscape_csv(outpath):
proviral_landscape_csv = os.path.join(outpath, 'proviral_landscape.csv')
landscape_rows = []

table_precursor_csv = os.path.join(outpath, 'table_precursor.csv')
blastn_csv = glob.glob(
os.path.join(
outpath,
'hivseqinr*',
'Results_Intermediate',
'Output_Blastn_HXB2MEGA28_tabdelim.txt'
)
)[0]

blastn_columns = ['qseqid',
'qlen',
'sseqid',
'sgi',
'slen',
'qstart',
'qend',
'sstart',
'send',
'evalue',
'bitscore',
'length',
'pident',
'nident',
'btop',
'stitle',
'sstrand']
with open(blastn_csv, 'r') as blastn_file:
blastn_reader = DictReader(blastn_file, fieldnames=blastn_columns, delimiter='\t')
for row in blastn_reader:
if row['qseqid'] in ['8E5LAV', 'HXB2']:
# skip the positive control rows
continue
ref_start = int(row['sstart'])
ref_end = int(row['send'])
if ref_end <= LEFT_PRIMER_END or ref_start >= RIGHT_PRIMER_START:
# skip unspecific matches of LTR at start and end
continue
[run_name, sample_name, _, _] = row['qseqid'].split('::')
is_inverted = ''
if ref_end < ref_start:
# automatically recognize inverted regions
new_end = ref_start
ref_start = ref_end
ref_end = new_end
is_inverted = 'yes'
landscape_entry = {'ref_start': ref_start,
'ref_end': ref_end,
'samp_name': sample_name,
'run_name': run_name,
'is_inverted': is_inverted,
'is_defective': ''}
# is_defective is empty for now, will be filled manually
landscape_rows.append(landscape_entry)

with open(table_precursor_csv, 'r') as tab_prec:
tab_prec_reader = DictReader(tab_prec)
for row in tab_prec_reader:
samp_name = row['sample']
verdict = row['MyVerdict']
for entry in landscape_rows:
if entry['samp_name'] == samp_name:
entry['defect'] = verdict

landscape_columns = ['samp_name', 'run_name', 'ref_start', 'ref_end', 'defect', 'is_inverted', 'is_defective']
with open(proviral_landscape_csv, 'w') as landscape_file:
landscape_writer = csv.DictWriter(landscape_file, fieldnames=landscape_columns)
landscape_writer.writeheader()
landscape_writer.writerows(landscape_rows)


def get_softclipped_region(query, alignment, alignment_path):
try:
size, op = alignment.iloc[0]['cigar'][0]
Expand Down Expand Up @@ -672,4 +751,4 @@ def get_samples_from_cascade(cascade_csv: typing.IO,
'hxb2_end': 9632,
'direction': 'rev'
}
}
}