Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Di 1112 exomiser ranking fix #11

Merged
merged 10 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
name: pytest
on:
push:
branches:
- main
pull_request:
branches:
- main

jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.8.18

- name: Install dependencies
run: pip install -r requirements.txt

- name: Run unit tests
run: python -m pytest resources/home/dnanexus/tests/test_make_workbook.py
4 changes: 2 additions & 2 deletions dxapp.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
"whatsNew": {
"1.0.0": "Initial version",
"1.1.0": "Fix to make compatible with older GEL JSONs",
"1.1.1": "Bug fix for workbooks with no GEL tiering variants"
"1.2.0": "Feedback changes, correct exomiser ranking, fix bugs"
},
"dxapi": "1.0.0",
"version": "1.1.1",
"version": "1.2.0",
"inputSpec": [
{
"name": "json",
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ openpyxl==3.1.2
pandas==1.4.1
Pillow==9.2.0
pytest==7.0.1
pytest-subtests=0.13.1
pytest-subtests==0.13.1
pytz==2021.3
networkx==2.8.5
38 changes: 21 additions & 17 deletions resources/home/dnanexus/get_variant_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,46 +337,50 @@ def get_top_3_ranked(ranked):
ranked variants. It will return all variants at each rank; so all the
first ranked, all the second ranked and all the third ranked variants.
Inputs:
ranked (list): list of Exomiser variants
ranked (dict): dict of indices and Exomiser ranks from exomiser df
Outputs:
to_report (list): top three Exomiser variants to report back
top (list): df indices of variants in the top three Exomiser ranks
'''
gold = []
silver = []
bronze = []
top = []

rank = lambda x: x['reportEvents']['vendorSpecificScores']['rank']
ordered_list = dict(sorted(ranked.items(), key=lambda item: item[1]))

ordered_list = sorted(ranked, key=rank)

for snv in ordered_list:
for k, v in ordered_list.items():
if not gold:
gold.append(snv)
gold.append(v)
top.append(k)
continue
else:
if rank(snv) == rank(gold[0]):
gold.append(snv)
if v == gold[0]:
gold.append(v)
top.append(k)
continue

if not silver:
silver.append(snv)
silver.append(v)
top.append(k)
continue
else:
if rank(snv) == rank(silver[0]):
silver.append(snv)
if v == silver[0]:
silver.append(v)
top.append(k)
continue

if not bronze:
bronze.append(snv)
bronze.append(v)
top.append(k)
continue
else:
if rank(snv) == rank(bronze[0]):
bronze.append(snv)
if v == bronze[0]:
bronze.append(v)
top.append(k)
continue
else:
break

return gold + silver + bronze
return top


class VariantNomenclature():
Expand Down
50 changes: 31 additions & 19 deletions resources/home/dnanexus/make_workbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,9 +726,7 @@ def create_additional_analysis_page(self):
x for x in ranked if x['reportEvents']['score'] >= 0.75
]

to_report = VariantUtils.get_top_3_ranked(ranked_and_above_threshold)

for snv in to_report:
for snv in ranked_and_above_threshold:
# put reportevents dict within a list to allow it to have an index
snv['reportEvents'] = [snv['reportEvents']]
# event index will always be 0 as we have made it so there is only
Expand Down Expand Up @@ -784,30 +782,44 @@ def create_additional_analysis_page(self):
ex_df = pd.DataFrame(variant_list)
ex_df = ex_df.drop_duplicates()
if not ex_df.empty and not self.var_df.empty:
# Get list of all columns except 'Priority' column
col_except_priority = self.column_list
col_except_priority.remove('Priority')
# Convert all df columns to object type to allow merging without
# conflicts
ex_df = ex_df.astype(object)
self.var_df = self.var_df.astype(object)
# Merge exomiser df with tiered variants df, indicating if there
# is a difference between Priority
ex_df = ex_df.merge(
merge_df = ex_df.merge(
self.var_df,
left_on=col_except_priority,
right_on=col_except_priority,
indicator='_merge',
how='outer'
on=["Chr", 'Pos', 'Ref', 'Alt'],
how='left',
indicator=True
)
# Keep left only == keep only those that are in exomiser df and
# not in tiered df
ex_df = ex_df[ex_df['_merge'] == 'left_only']
# Clean up merge columns (drop _merge, and priority_y, rename
# priority_x)
ex_df = ex_df.drop(labels=['_merge'], axis='columns')
ex_df = ex_df.rename(columns={'Priority_x': 'Priority'})
ex_df = ex_df.drop(labels=['Priority_y'], axis='columns')
merge_df = merge_df[merge_df['_merge'] == 'left_only']
# Clean up merge columns (drop the _merge column and columns ending _y,
# rename columns ending _x back to their original names)
d = {}
for col in self.var_df.columns:
d[col + '_x'] = col
merge_df = merge_df.rename(columns=d)
merge_df = merge_df.drop(columns=['_merge'])
ex_df = merge_df[merge_df.columns.drop(
list(merge_df.filter(regex='.*\_y'))
)]

if not ex_df.empty:
# Now we have filtered out all variants that are in the GEL tiering
# page we need to get the top 3 ranks in the exomiser df
ex_df = ex_df.reset_index()
# Get dict of {index: int(exomiser rank)}
ranks = ex_df['Priority'].to_dict()
for k,v in ranks.items():
ranks[k] = int(v.split(' ')[-1].split('.')[0])
# Get indices of top ranked variants
top_ranked_indices = VariantUtils.get_top_3_ranked(ranks)
# Keep only top ranked variants + drop col with prev indices
ex_df = ex_df.iloc[top_ranked_indices]
ex_df = ex_df.drop(columns=['index'])

# Sort df by priority first, and then gene name
ex_df = ex_df.sort_values(['Priority', 'Gene'])

Expand Down
31 changes: 7 additions & 24 deletions resources/home/dnanexus/tests/test_make_workbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,38 +226,21 @@ class TestRanking():
'''
Tests for ranking function
'''
snvs = [
{'reportEvents': {'vendorSpecificScores': {'rank': 1}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 2}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 3}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 3}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 4}}}
]
snvs = {0: 1, 1: 2, 2: 3, 3: 3, 4: 4}

def test_can_handle_two_bronze(self):
'''
Check both third ranked items are returned.
Check that the indices of both third ranked items are returned.
'''
assert VariantUtils.get_top_3_ranked(self.snvs) == [
{'reportEvents': {'vendorSpecificScores': {'rank': 1}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 2}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 3}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 3}}}
]
assert VariantUtils.get_top_3_ranked(self.snvs) == [0, 1, 2, 3]

def test_next_ranked_returned_if_no_items_at_rank(self):
'''
Check that third and fourth ranked items are returned if there is no
second ranked item
Check that indices for the third and fourth ranked items are returned
if there is no second ranked item
'''
self.snvs[1] = {'reportEvents': {'vendorSpecificScores': {'rank': 3}}}
assert VariantUtils.get_top_3_ranked(self.snvs) == [
{'reportEvents': {'vendorSpecificScores': {'rank': 1}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 3}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 3}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 3}}},
{'reportEvents': {'vendorSpecificScores': {'rank': 4}}}
]
self.snvs[1] = 3
assert VariantUtils.get_top_3_ranked(self.snvs) == [0, 1, 2, 3, 4]


class TestVariantNomenclature():
Expand Down
Loading