Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch from HIVSeqinR to HIVIntact #23

Draft
wants to merge 47 commits into
base: master
Choose a base branch
from
Draft
Changes from 1 commit
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
55536fb
Add HIVIntact analysis, for #10.
donkirkby Jul 24, 2021
1d54987
Install mafft and blast in GitHub Actions, for #10.
donkirkby Jul 26, 2021
6b77cac
Enable more checks in HIVIntact
Donaim May 19, 2023
28ae510
Specify writable workdir for hivintact
Donaim Jul 14, 2023
9aadb15
Update dependencies in the README file
Donaim Jun 5, 2023
7cfd902
Add HIVIntact to our python dependencies
Donaim Jun 5, 2023
7f4eba1
Generate table_precursor table from hivintact output
Donaim Jun 9, 2023
891b4a8
Remember intact sequences
Donaim Jun 9, 2023
b88c4e4
Update hivintact errors list
Donaim Jun 24, 2023
0bca5be
Improve hivintact errors priority and translation
Donaim Jun 30, 2023
6ad62bd
Bump HIVIntact from cfe-1.2 to cfe-1.3
Donaim Jul 21, 2023
7e15cf0
Generate proviral_landscape for HIVIntact runs too
Donaim Jul 21, 2023
719a14b
Singularity: make HIVIntact the default and the only app
Donaim Jul 21, 2023
a1294df
Switch to csv outputs in HIVIntact
Donaim Jul 21, 2023
5c378e9
Rename "hivseqinr_results_tar" argument to "detailed_results_tar"
Donaim Jul 21, 2023
56d06d9
Bump HIVIntact from 1.3 to 1.4
Donaim Sep 19, 2023
eae7b0f
Do not translate HIVIntact errors to HIVSeqinR ones
Donaim Sep 19, 2023
775f100
Fix table precursor construction
Donaim Sep 20, 2023
120fe49
Add missing codes to the HIVINTACT_ERRORS_TABLE
Donaim Sep 20, 2023
5367241
fix the check for when to use HIVSeqinR results
Donaim Mar 7, 2024
95ae330
Various code improvements
Donaim Mar 7, 2024
9d725a1
Switch to CFEIntact
Donaim Jul 8, 2024
1c5eca6
Update Pipfile.lock
Donaim Jul 8, 2024
3a0bf9d
Update cfeintact command
Donaim Jul 8, 2024
2345afa
Bump CFEIntact version
Donaim Jul 8, 2024
bc1d1ea
Use CFEIntact's python API instead of operating system's shell
Donaim Jul 8, 2024
514221c
Do not install cfeintact twice
Donaim Jul 8, 2024
1819951
Fix column name of CFEIntact errors.csv
Donaim Jul 9, 2024
05aac4f
Improve reading of .SAM files
Donaim Jul 9, 2024
49022e0
Small refactoring of utils.py
Donaim Jul 9, 2024
3d53ae8
Add a separate proviral_landscapes.py script
Donaim Jul 9, 2024
c1698b4
Fix defect column value for intact sequences
Donaim Jul 9, 2024
0925e73
Move all landscapes stuff to landscapes.py
Donaim Jul 9, 2024
c1ed309
Bump CFEIntact version
Donaim Jul 12, 2024
eea0151
Bump CFEIntact version
Donaim Jul 16, 2024
6d18b36
Bump CFEIntact version
Donaim Jul 26, 2024
2b9a7b6
Bump CFEIntact version
Donaim Jul 27, 2024
e68bfe5
Update CFEIntact errors table
Donaim Jul 27, 2024
75fcf9c
Bump CFEIntact version
Donaim Jul 28, 2024
731556b
Bump CFEIntact version
Donaim Jul 28, 2024
9454ade
Fix landscape generation based on sample_name
Donaim Jul 29, 2024
87aaffb
Fix ignoring intact sequences in table_precursor and landscapes_plots
Donaim Jul 29, 2024
5fc9340
Bump CFEIntact version
Donaim Jul 29, 2024
28950eb
Bump pandas version
Donaim Jul 30, 2024
6098677
Fix reading of SAM files
Donaim Jul 30, 2024
9f874da
Add missing UnknownNucleotide CFEIntact defect code
Donaim Jul 30, 2024
92c8d39
Add missing SequenceDivergence CFEIntact defect code
Donaim Jul 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Generate table_precursor table from hivintact output
Donaim committed Jul 21, 2023

Verified

This commit was signed with the committer’s verified signature.
commit 7f4eba1267a313ee602ae51548006b8eaaea964e
78 changes: 66 additions & 12 deletions gene_splicer/utils.py
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@
import typing

import yaml
import json
import shutil
import subprocess as sp
import pandas as pd
@@ -391,14 +392,54 @@ def align(target_seq,
return alignment_path


def generate_table_precursor(name, outpath, add_columns=None):
# Output csv
precursor_path: Path = outpath / 'table_precursor.csv'
# Maps HIVIntact error codes to the verdict label that is reported for them.
# translate_hivintact_error() looks codes up here and returns any code that
# has no entry unchanged.
# NOTE(review): the mapped labels look like HIVSeqinR verdict names, kept so
# that downstream consumers see a familiar vocabulary -- confirm.
HIVINTACT_TRANSLATION_TABLE = {
'APOBECHypermutationDetected': 'Hypermut',
'LongDeletion': 'LargeDeletion',
'PackagingSignalDeletion': '5DEFECT',
'WrongORFNumber': 'PrematureStop_OR_AAtooLong_OR_AAtooShort',
}

# Load filtered sequences
filtered_path = outpath / (name + '_filtered.csv')
filtered = pd.read_csv(filtered_path)
# Load hivseqinr data
def translate_hivintact_error(error):
    """Return the verdict label to report for a HIVIntact error code.

    Codes that have an entry in HIVINTACT_TRANSLATION_TABLE are replaced by
    the mapped label; every other code is returned unchanged.
    """
    if error in HIVINTACT_TRANSLATION_TABLE:
        return HIVINTACT_TRANSLATION_TABLE[error]
    return error

# HIVIntact error codes in priority order. The position of a code in this
# list is the sort key used by iterate_hivintact_data(): when a sequence has
# several errors, the one that appears earliest here becomes its verdict.
# Reordering the entries therefore changes which defect gets reported.
HIVINTACT_ERRORS_TABLE = [
'NonHIV',
'LongDeletion',
'Scramble',
'InternalInversion',
'APOBECHypermutationDetected',
'MisplacedORF',
'WrongORFNumber',
'DeletionInOrf',
'FrameshiftInOrf',
'MajorSpliceDonorSiteMutated',
'PackagingSignalDeletion',
'PackagingSignalNotComplete',
'RevResponseElementDeletion',
]

def iterate_hivintact_data(name, outpath):
    """Yield ``[SEQID, verdict]`` rows from every HIVIntact result directory.

    Each directory matching ``outpath / 'hivintact*'`` is expected to hold an
    ``intact.fasta`` and an ``errors.json`` file.  Every sequence found in
    ``intact.fasta`` yields the verdict ``'Intact'``.  For every sequence in
    ``errors.json`` that has at least one entry with an ``'error'`` key, the
    highest-priority error (earliest position in HIVINTACT_ERRORS_TABLE) is
    passed through translate_hivintact_error() and yielded as the verdict.
    Sequences in ``errors.json`` without any error entry yield nothing.

    Args:
        name: Unused by this function; accepted so the signature matches the
            other result loaders.
        outpath: ``pathlib.Path``-like directory that contains the
            ``hivintact*`` result directories.

    Yields:
        Two-element lists ``[SEQID, verdict]``.
    """

    def priority(error):
        # Codes that are missing from HIVINTACT_ERRORS_TABLE rank after every
        # known code.  list.index() alone would raise ValueError for them and
        # abort the whole table generation because of one new error code.
        try:
            return HIVINTACT_ERRORS_TABLE.index(error)
        except ValueError:
            return len(HIVINTACT_ERRORS_TABLE)

    for result_dir in glob.glob(str(outpath / 'hivintact*')):
        for (seq_id, _sequence) in read_fasta(
                os.path.join(result_dir, 'intact.fasta')):
            yield [seq_id, 'Intact']

        # Parse the whole file first so the handle is closed before control
        # is handed back to the consumer of this generator.
        with open(os.path.join(result_dir, 'errors.json'), 'r') as f:
            errors_by_sequence = json.load(f)

        for seq_id, entries in errors_by_sequence.items():
            all_errors = [obj['error'] for obj in entries if 'error' in obj]
            if not all_errors:
                continue
            # min() returns the first of equally ranked items, exactly like
            # taking element 0 of a stable sort.
            worst = min(all_errors, key=priority)
            yield [seq_id, translate_hivintact_error(worst)]

def get_hivintact_data(name, outpath):
    """Collect the HIVIntact verdicts into a DataFrame.

    The rows produced by iterate_hivintact_data(name, outpath) become a frame
    with the columns ``SEQID`` and ``MyVerdict``.  The frame has no rows when
    the iterator yields nothing.
    """
    rows = iterate_hivintact_data(name, outpath)
    return pd.DataFrame(rows, columns=['SEQID', 'MyVerdict'])

def get_hivseqinr_data(name, outpath):
seqinr_paths = glob.glob(
str(outpath / 'hivseqinr*' / 'Results_Final' /
'Output_MyBigSummary_DF_FINAL.csv'))
@@ -409,13 +450,26 @@ def generate_table_precursor(name, outpath, add_columns=None):
part = pd.read_csv(path)
parts.append(part)
# seqinr = pd.read_csv(seqinr_path)
return pd.concat(parts)

def generate_table_precursor(name, outpath, add_columns=None):
# Output csv
precursor_path: Path = outpath / 'table_precursor.csv'

# Load filtered sequences
filtered_path = outpath / (name + '_filtered.csv')
filtered = pd.read_csv(filtered_path)
# Load hivseqinr data or HIVIntact results
results = get_hivintact_data(name, outpath)
if results.empty:
results = get_hivseqinr_data(name, outpath)

try:
seqinr = pd.concat(parts)
# Assign new columns based on split
seqinr[['name', 'sample', 'reference',
'seqtype']] = seqinr['SEQID'].str.split('::', expand=True)
results[['name', 'sample', 'reference',
'seqtype']] = results['SEQID'].str.split('::', expand=True)
# Merge
merged = seqinr.merge(filtered, on='sample')
merged = results.merge(filtered, on='sample')
except ValueError:
with precursor_path.open('w') as output_file:
writer = DictWriter(output_file,
@@ -448,7 +502,7 @@ def generate_table_precursor(name, outpath, add_columns=None):
if add_columns:
for key, val in add_columns.items():
merged[key] = val
if parts:
if not results.empty:
merged[['sample', 'sequence', 'MyVerdict'] + genes_of_interest].to_csv(
precursor_path, index=False)
else: