-
Notifications
You must be signed in to change notification settings - Fork 9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Util fix #105
base: master
Are you sure you want to change the base?
Util fix #105
Changes from 5 commits
1b98334
546ad8b
64daf21
0797d1f
233ee00
849427b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
from scipy.stats import beta | ||
import pandas as pd | ||
|
||
|
||
def calculate_proportion(_x, _n): | ||
x = _x.round() | ||
n = _n.round() | ||
|
@@ -19,7 +20,6 @@ def expand_dates(df, date_min, date_max, index_col, grp_col): | |
df | ||
.set_index(index_col) | ||
.reindex(idx, fill_value = 0) | ||
.drop(grp_col, axis = 1) | ||
.reset_index() | ||
.rename( | ||
columns = { | ||
|
@@ -197,25 +197,38 @@ def create_nested_mutation_query(location_id = None, lineages = [], mutations = | |
parse_location_id_to_query(location_id, query_obj) | ||
return query_obj | ||
|
||
def classify_other_category(grp, keep_lineages): | ||
def classify_other_category(grp, keep_lineages): # Understood as ignores any lineages user want to keep | ||
grp.loc[(~grp["lineage"].isin(keep_lineages)) | (grp["lineage"] == "none"), "lineage"] = "other" # Temporarily remove none. TODO: Proper fix | ||
grp = grp.groupby("lineage").agg({ | ||
"total_count": lambda x: x.iloc[0], | ||
"lineage_count": "sum" | ||
}) | ||
return grp | ||
|
||
def get_major_lineage_prevalence(df, index_col, keep_lineages = [], prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180): | ||
date_limit = dt.today() - timedelta(days = ndays) | ||
lineages_to_retain = df[(df["prevalence"] >= prevalence_threshold) & (df["date"] >= date_limit)]["lineage"].value_counts() | ||
num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] | ||
def get_major_lineage_prevalence(df, index_col = "date", min_date = None, max_date = None, keep_lineages = [], prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180): | ||
|
||
df['prevalence'] = df['total_count']/df['lineage_count'] | ||
df = df.sort_values(by="date") #Sort date values | ||
|
||
|
||
if min_date and max_date: | ||
df = df[(df["date"].between(min_date, max_date)) & (df["prevalence"] >= prevalence_threshold)] | ||
num_unique_dates = df[df["date"] >= min_date]["date"].unique().shape[0] #counts # of unique days lineage is found | ||
elif min_date: | ||
date_limit = dt.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays) # searches from min_date to ndays forward | ||
df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > min_date) & (df['date'] < date_limit)] | ||
num_unique_dates = df[df["date"] <= date_limit]["date"].unique().shape[0] | ||
else: | ||
date_limit = dt.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays) # searches from max_date to ndays back | ||
df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] < max_date) & (df['date'] > date_limit)] | ||
num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since you've already filtered the df at this point, I think you can skip the [inline code lost in extraction — presumably the redundant `df["date"] >= date_limit` filter on the line above] |
||
|
||
if num_unique_dates < nday_threshold: | ||
nday_threshold = round((nday_threshold/ndays) * num_unique_dates) | ||
lineages_to_retain = lineages_to_retain[lineages_to_retain >= nday_threshold].index.tolist() | ||
lineages_to_retain.extend(keep_lineages) | ||
nday_threshold = round((nday_threshold/ndays) * num_unique_dates) | ||
lineage_counts = df["lineage"].value_counts() #number of times lineage is found in df | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you put your prevalence threshold filter down here instead, you should get the same counts as your current version, but there won't be gaps in the dataframe on low-prevalence days. |
||
lineages_to_retain = lineage_counts[lineage_counts >= nday_threshold].index.to_list() #lineages found at least [nday_threshold] times won't be grouped | ||
keep_lineages.extend(lineages_to_retain) | ||
df = df.groupby(index_col).apply(classify_other_category, lineages_to_retain) | ||
df = df.reset_index() | ||
df.loc[:,"prevalence"] = df["lineage_count"]/df["total_count"] | ||
return df | ||
|
||
def parse_location_id_to_query(query_id, query_obj = None): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you just want to filter by date here, and not by prevalence just yet. Later, you're removing the lineages which don't have enough days above the prevalence threshold, but we still want to return data for low-prevalence days for lineages that are above the threshold.