Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Util fix #105

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 24 additions & 11 deletions web/handlers/genomics/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from scipy.stats import beta
import pandas as pd


def calculate_proportion(_x, _n):
x = _x.round()
n = _n.round()
Expand All @@ -19,7 +20,6 @@ def expand_dates(df, date_min, date_max, index_col, grp_col):
df
.set_index(index_col)
.reindex(idx, fill_value = 0)
.drop(grp_col, axis = 1)
.reset_index()
.rename(
columns = {
Expand Down Expand Up @@ -197,25 +197,38 @@ def create_nested_mutation_query(location_id = None, lineages = [], mutations =
parse_location_id_to_query(location_id, query_obj)
return query_obj

def classify_other_category(grp, keep_lineages):
    """Collapse minor lineages in one group into a single "other" row.

    Any lineage not listed in *keep_lineages* — and the "none" placeholder,
    which is folded away here as a stopgap (TODO: proper fix) — is relabeled
    "other", then counts are re-aggregated per lineage.

    Parameters
    ----------
    grp : DataFrame slice with "lineage", "total_count" and "lineage_count"
        columns (typically one date-group from a groupby.apply).
    keep_lineages : iterable of lineage names to leave untouched.

    Returns
    -------
    DataFrame indexed by lineage, with "total_count" taken from the first row
    of each lineage group and "lineage_count" summed.
    """
    # Relabel in place: everything outside the keep-list, plus "none".
    to_other = ~(grp["lineage"].isin(keep_lineages)) | (grp["lineage"] == "none")
    grp.loc[to_other, "lineage"] = "other"
    aggregated = grp.groupby("lineage").agg({
        # total_count is constant within a group, so the first value suffices.
        "total_count": lambda counts: counts.iloc[0],
        "lineage_count": "sum",
    })
    return aggregated

def get_major_lineage_prevalence(df, index_col = "date", min_date = None, max_date = None, keep_lineages = None, prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180):
    """Collapse low-prevalence lineages into an "other" category.

    A lineage is retained if its daily prevalence reaches
    ``prevalence_threshold`` on at least ``nday_threshold`` distinct days
    inside the date window, or if it is explicitly listed in
    ``keep_lineages``. Everything else is grouped into "other" via
    ``classify_other_category``.

    Parameters
    ----------
    df : DataFrame with "date", "lineage", "lineage_count" and "total_count"
        columns.
    index_col : column to group by when collapsing (default "date").
    min_date, max_date : "%Y-%m-%d" strings bounding the window. If only one
        is given, the window extends ``ndays`` forward (from min_date) or
        backward (from max_date).
        NOTE(review): assumes df["date"] compares correctly against these
        strings / parsed datetimes — confirm the column dtype upstream.
    keep_lineages : lineages to always retain; never mutated (default: none).
    prevalence_threshold, nday_threshold, ndays : retention tuning knobs.

    Returns
    -------
    DataFrame with minor lineages collapsed into "other" and "prevalence"
    recomputed as lineage_count / total_count.
    """
    # Copy the caller's keep-list: the previous version extended the argument
    # in place, which both mutated the caller's list and leaked state across
    # calls through the mutable default ([]).
    keep_lineages = [] if keep_lineages is None else list(keep_lineages)

    df = df.copy()
    # prevalence = lineage_count / total_count. (The previous version had the
    # ratio inverted here, disagreeing with the recomputation at the end and
    # skewing the threshold filter.)
    df["prevalence"] = df["lineage_count"] / df["total_count"]
    df = df.sort_values(by = "date")

    # Filter by date only. The prevalence threshold is applied later, solely
    # to decide which lineages to retain — low-prevalence days of retained
    # lineages must still appear in the output (per review feedback).
    if min_date and max_date:
        df = df[df["date"].between(min_date, max_date)]
    elif min_date:
        # Search from min_date up to ndays forward.
        date_limit = dt.strptime(min_date, "%Y-%m-%d") + timedelta(days = ndays)
        df = df[(df["date"] > min_date) & (df["date"] < date_limit)]
    else:
        # Search from max_date up to ndays back.
        date_limit = dt.strptime(max_date, "%Y-%m-%d") - timedelta(days = ndays)
        df = df[(df["date"] < max_date) & (df["date"] > date_limit)]

    # df is already restricted to the window, so count unique dates directly
    # instead of re-filtering by date (per review feedback).
    num_unique_dates = df["date"].nunique()
    if num_unique_dates < nday_threshold:
        # Scale the day threshold down proportionally for short windows.
        nday_threshold = round((nday_threshold / ndays) * num_unique_dates)

    # Number of days each lineage is at or above the prevalence threshold.
    lineage_counts = df[df["prevalence"] >= prevalence_threshold]["lineage"].value_counts()
    lineages_to_retain = lineage_counts[lineage_counts >= nday_threshold].index.tolist()
    # Fold the user's keep-list into the retained set and pass the combined
    # list on — the previous version extended keep_lineages but then passed
    # only lineages_to_retain, silently dropping the user's lineages.
    lineages_to_retain.extend(keep_lineages)

    df = df.groupby(index_col).apply(classify_other_category, lineages_to_retain)
    df = df.reset_index()
    df.loc[:, "prevalence"] = df["lineage_count"] / df["total_count"]
    return df

def parse_location_id_to_query(query_id, query_obj = None):
Expand Down