From 1b9833415c0bea4e623ed190df515fc7311bcd39 Mon Sep 17 00:00:00 2001 From: Sarah Randall Date: Thu, 24 Aug 2023 21:03:46 -0700 Subject: [PATCH 1/5] added date params --- web/handlers/genomics/util.py | 64 +++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index 2eaeb36d..ce5e451c 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -2,6 +2,24 @@ from scipy.stats import beta import pandas as pd +import os +os.chdir('/Users/sarahrandall/Downloads') + +data = pd.read_json("prevalence-by-location-all-lineages-test-case.jsonl.gz", lines=True) + +# min_date="2022-03-15" +# max_date="2022-03-20" +# prevalence_threshold = 0.05 + +# data = data[(data["date"].between(min_date, max_date)) & (data["prevalence"] >= prevalence_threshold)] +#index_col should always be "date" + + + + +# # data['proportion'] = data['proportion'].apply(lambda x: x*100) + + def calculate_proportion(_x, _n): x = _x.round() n = _n.round() @@ -19,7 +37,6 @@ def expand_dates(df, date_min, date_max, index_col, grp_col): df .set_index(index_col) .reindex(idx, fill_value = 0) - .drop(grp_col, axis = 1) .reset_index() .rename( columns = { @@ -197,7 +214,7 @@ def create_nested_mutation_query(location_id = None, lineages = [], mutations = parse_location_id_to_query(location_id, query_obj) return query_obj -def classify_other_category(grp, keep_lineages): +def classify_other_category(grp, keep_lineages): # Understood as ignores any lineages user want to keep grp.loc[(~grp["lineage"].isin(keep_lineages)) | (grp["lineage"] == "none"), "lineage"] = "other" # Temporarily remove none. TODO: Proper fix grp = grp.groupby("lineage").agg({ "total_count": lambda x: x.iloc[0], @@ -205,18 +222,37 @@ def classify_other_category(grp, keep_lineages): }) return grp -def get_major_lineage_prevalence(df, index_col, keep_lineages = [], prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180): - date_limit = dt.today() - timedelta(days = ndays) - lineages_to_retain = df[(df["prevalence"] >= prevalence_threshold) & (df["date"] >= date_limit)]["lineage"].value_counts() - num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] - if num_unique_dates < nday_threshold: - nday_threshold = round((nday_threshold/ndays) * num_unique_dates) - lineages_to_retain = lineages_to_retain[lineages_to_retain >= nday_threshold].index.tolist() - lineages_to_retain.extend(keep_lineages) - df = df.groupby(index_col).apply(classify_other_category, lineages_to_retain) - df = df.reset_index() - df.loc[:,"prevalence"] = df["lineage_count"]/df["total_count"] - return df +def get_major_lineage_prevalence(df, index_col = "date", min_date = None, max_date = None, keep_lineages = [], prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180): + + df['prevalence'] = df['total_count']/df['lineage_count'] + df = df.sort_values(by="date") #Sort date values + + if min_date and max_date: + df = df[(df["date"].between(min_date, max_date)) & (df["prevalence"] >= prevalence_threshold)] + if keep_lineages != []: # still unsure about what this is for? + df = df.groupby(index_col).apply(classify_other_category, keep_lineages) + #or grp.loc[(~grp["lineage"].isin(keep_lineages)) | (grp["lineage"] == "none"), "lineage"] = "other" only? + # should any and all lineages not in keep_lineages be called other or doesclassify() do other calculations? 
+ + elif ndays and nday_threshold: + if df["date"].iloc[-1] < dt.today(): #Will not work if ndays is outside of data date range + date = df["date"].iloc[-1] + else: + date = dt.today() + + date_limit = date - timedelta(days = ndays) + df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_limit) & (df['date'] < date)] + + #want to select data between today and timedelta + num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] + if num_unique_dates < nday_threshold: # what's this for? how it relate to threshold? + nday_threshold = round((nday_threshold/ndays) * num_unique_dates) + if keep_lineages != []: + df = df.groupby(index_col).apply(classify_other_category, keep_lineages) + + return df + + def parse_location_id_to_query(query_id, query_obj = None): if query_id == None: From 546ad8b5352672182b79b0c93826e1ef95d1a443 Mon Sep 17 00:00:00 2001 From: Sarah Randall Date: Tue, 29 Aug 2023 19:37:10 -0700 Subject: [PATCH 2/5] updating get_major_lineage_prevalence --- web/handlers/genomics/util.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index ce5e451c..8e6ed06c 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -2,23 +2,6 @@ from scipy.stats import beta import pandas as pd -import os -os.chdir('/Users/sarahrandall/Downloads') - -data = pd.read_json("prevalence-by-location-all-lineages-test-case.jsonl.gz", lines=True) - -# min_date="2022-03-15" -# max_date="2022-03-20" -# prevalence_threshold = 0.05 - -# data = data[(data["date"].between(min_date, max_date)) & (data["prevalence"] >= prevalence_threshold)] -#index_col should always be "date" - - - - -# # data['proportion'] = data['proportion'].apply(lambda x: x*100) - def calculate_proportion(_x, _n): x = _x.round() From 64daf212220681540b03b43118837a82947ac7c4 Mon Sep 17 00:00:00 2001 From: Sarah Randall Date: Tue, 29 Aug 2023 19:37:10 -0700 Subject: [PATCH 3/5] updating get_major_lineage_prevalence --- web/handlers/genomics/util.py | 69 +++++++++++++++++------------------ 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index ce5e451c..8534685a 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -2,23 +2,6 @@ from scipy.stats import beta import pandas as pd -import os -os.chdir('/Users/sarahrandall/Downloads') - -data = pd.read_json("prevalence-by-location-all-lineages-test-case.jsonl.gz", lines=True) - -# min_date="2022-03-15" -# max_date="2022-03-20" -# prevalence_threshold = 0.05 - -# data = data[(data["date"].between(min_date, max_date)) & (data["prevalence"] >= prevalence_threshold)] -#index_col should always be "date" - - - - -# # data['proportion'] = data['proportion'].apply(lambda x: x*100) - def calculate_proportion(_x, _n): x = _x.round() @@ -226,31 +209,45 @@ def get_major_lineage_prevalence(df, index_col = "date", min_date = None, max_da df['prevalence'] = df['total_count']/df['lineage_count'] df = df.sort_values(by="date") #Sort date values + min_date = dt.strptime(min_date, "%Y-%m-%d") + max_date = dt.strptime(max_date, "%Y-%m-%d") if min_date and max_date: df = df[(df["date"].between(min_date, max_date)) & (df["prevalence"] >= prevalence_threshold)] - if keep_lineages != []: # still unsure about what this is for? 
- df = df.groupby(index_col).apply(classify_other_category, keep_lineages) - #or grp.loc[(~grp["lineage"].isin(keep_lineages)) | (grp["lineage"] == "none"), "lineage"] = "other" only? - # should any and all lineages not in keep_lineages be called other or doesclassify() do other calculations? - - elif ndays and nday_threshold: - if df["date"].iloc[-1] < dt.today(): #Will not work if ndays is outside of data date range + + elif min_date: + date_limit = dt.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays) + df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > min_date) & (df['date'] < date_limit)] + + elif max_date: + date_limit = dt.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays) + df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] < max_date) & (df['date'] > date_limit)] + + + if df["date"].iloc[-1] < dt.today(): date = df["date"].iloc[-1] - else: + else: date = dt.today() - - date_limit = date - timedelta(days = ndays) - df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_limit) & (df['date'] < date)] - - #want to select data between today and timedelta - num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] - if num_unique_dates < nday_threshold: # what's this for? how it relate to threshold? - nday_threshold = round((nday_threshold/ndays) * num_unique_dates) - if keep_lineages != []: - df = df.groupby(index_col).apply(classify_other_category, keep_lineages) + date_range = date - timedelta(days = ndays) + df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_range) & (df['date'] < date)] + + date_limit = date - timedelta(days = ndays) + num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] + if num_unique_dates < nday_threshold: + nday_threshold = round((nday_threshold/ndays) * num_unique_dates) + + date_range = date - timedelta(days = nday_threshold) #Finding lineages nday_threshold days in the date range + lineages_to_retain = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_range) & (df['date'] < date)] + lineages_to_retain = lineages_to_retain['lineage'].to_list() + keep_lineages.append(lineages_to_retain) + + df = df.groupby(index_col).apply(classify_other_category, keep_lineages) + + return df - return df + # In order to be considered important enough to not be grouped into the "other" lineage, + # a lineage needs to appear on at least nday_threshold days in the date range -- + # so num_unique_dates > nday_threshold. 
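
A minimal standalone sketch (not part of any patch) of the date-window selection that PATCH 1/5 through PATCH 3/5 iterate on inside get_major_lineage_prevalence: when both bounds are given the window is explicit, and a single bound anchors a window of ndays on that side. It assumes df["date"] holds ISO-formatted ("YYYY-MM-DD") date strings; the helper name select_date_window is illustrative only and does not exist in util.py.

from datetime import datetime, timedelta
import pandas as pd

def select_date_window(df, min_date=None, max_date=None, ndays=180):
    # Keep only rows whose "date" falls in the requested window.
    df = df.sort_values("date")
    if min_date and max_date:
        # Explicit window: keep everything between the two bounds (inclusive).
        return df[df["date"].between(min_date, max_date)]
    if min_date:
        # Only a lower bound: look ndays forward from min_date.
        upper = datetime.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays)
        return df[(df["date"] >= min_date) & (df["date"] <= upper.strftime("%Y-%m-%d"))]
    if max_date:
        # Only an upper bound: look ndays back from max_date.
        lower = datetime.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays)
        return df[(df["date"] >= lower.strftime("%Y-%m-%d")) & (df["date"] <= max_date)]
    return df

Because ISO date strings sort lexicographically in calendar order, the comparisons above work without converting the whole column to datetimes.
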
From 233ee00a45f66b1f288f5430e050a6e08d7cef0d Mon Sep 17 00:00:00 2001 From: Sarah Randall Date: Sun, 1 Oct 2023 21:38:01 -0700 Subject: [PATCH 4/5] fixed logic --- web/handlers/genomics/util.py | 46 ++++++++++------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index 8534685a..603fbf6e 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -209,48 +209,28 @@ def get_major_lineage_prevalence(df, index_col = "date", min_date = None, max_da df['prevalence'] = df['total_count']/df['lineage_count'] df = df.sort_values(by="date") #Sort date values - min_date = dt.strptime(min_date, "%Y-%m-%d") - max_date = dt.strptime(max_date, "%Y-%m-%d") + if min_date and max_date: df = df[(df["date"].between(min_date, max_date)) & (df["prevalence"] >= prevalence_threshold)] - + num_unique_dates = df[df["date"] >= min_date]["date"].unique().shape[0] #counts # of unique days lineage is found elif min_date: - date_limit = dt.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays) + date_limit = dt.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays) # searches from min_date to ndays forward df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > min_date) & (df['date'] < date_limit)] - - elif max_date: - date_limit = dt.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays) - df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] < max_date) & (df['date'] > date_limit)] - - - if df["date"].iloc[-1] < dt.today(): - date = df["date"].iloc[-1] + num_unique_dates = df[df["date"] <= date_limit]["date"].unique().shape[0] else: - date = dt.today() - date_range = date - timedelta(days = ndays) - df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_range) & (df['date'] < date)] - - date_limit = date - timedelta(days = ndays) - num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] - if num_unique_dates < nday_threshold: - nday_threshold = round((nday_threshold/ndays) * num_unique_dates) - - date_range = date - timedelta(days = nday_threshold) #Finding lineages nday_threshold days in the date range - lineages_to_retain = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_range) & (df['date'] < date)] - lineages_to_retain = lineages_to_retain['lineage'].to_list() - keep_lineages.append(lineages_to_retain) + date_limit = dt.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays) # searches from max_date to ndays back + df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] < max_date) & (df['date'] > date_limit)] + num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] - df = df.groupby(index_col).apply(classify_other_category, keep_lineages) - + if num_unique_dates < nday_threshold: + nday_threshold = round((nday_threshold/ndays) * num_unique_dates) + lineage_counts = df["lineage"].value_counts() #number of times lineage is found in df + lineages_to_retain = lineage_counts[lineage_counts >= nday_threshold].index.to_list() #lineages found at least [nday_threshold] times won't be grouped + keep_lineages.extend(lineages_to_retain) + df = df.groupby(index_col).apply(classify_other_category, lineages_to_retain) return df - # In order to be considered important enough to not be grouped into the "other" lineage, - # a lineage needs to appear on at least nday_threshold days in the date range -- - # so num_unique_dates > nday_threshold. 
- - - def parse_location_id_to_query(query_id, query_obj = None): if query_id == None: return None From 849427bf046c0bcc74089c1daec7eac05135a043 Mon Sep 17 00:00:00 2001 From: Sarah Randall Date: Sun, 15 Oct 2023 18:03:27 -0700 Subject: [PATCH 5/5] final changes --- web/handlers/genomics/util.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index 603fbf6e..fbb2873e 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -210,22 +210,20 @@ def get_major_lineage_prevalence(df, index_col = "date", min_date = None, max_da df['prevalence'] = df['total_count']/df['lineage_count'] df = df.sort_values(by="date") #Sort date values - if min_date and max_date: - df = df[(df["date"].between(min_date, max_date)) & (df["prevalence"] >= prevalence_threshold)] - num_unique_dates = df[df["date"] >= min_date]["date"].unique().shape[0] #counts # of unique days lineage is found + df = df[(df["date"].between(min_date, max_date))] elif min_date: date_limit = dt.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays) # searches from min_date to ndays forward - df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > min_date) & (df['date'] < date_limit)] - num_unique_dates = df[df["date"] <= date_limit]["date"].unique().shape[0] + df = df[(df['date'] >= min_date) & (df['date'] <= date_limit)] else: date_limit = dt.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays) # searches from max_date to ndays back - df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] < max_date) & (df['date'] > date_limit)] - num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] + df = df[(df['date'] <= max_date) & (df['date'] >= date_limit)] + + num_unique_dates = df["date"].unique().shape[0] #counts # of unique days lineage is found if num_unique_dates < nday_threshold: nday_threshold = round((nday_threshold/ndays) * num_unique_dates) - lineage_counts = df["lineage"].value_counts() #number of times lineage is found in df + lineage_counts = df[(df["prevalence"] >= prevalence_threshold)]["lineage"].value_counts() #number of times lineage is found in df lineages_to_retain = lineage_counts[lineage_counts >= nday_threshold].index.to_list() #lineages found at least [nday_threshold] times won't be grouped keep_lineages.extend(lineages_to_retain) df = df.groupby(index_col).apply(classify_other_category, lineages_to_retain) @@ -243,7 +241,7 @@ def parse_location_id_to_query(query_id, query_obj = None): } location_types = ["country_id", "division_id", "location_id"] for i in range(min(3, len(location_codes))): - if i == 1 and len(location_codes[i].split("-")) > 1: # For division remove iso2 code if present + if i == 1 and len(location_codes[i].split("-")) > 1: # For division remove iso2 code if present location_codes[i] = location_codes[i].split("-")[1] if "must" in query_obj["bool"]: query_obj["bool"]["must"].append({
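
For reference, a rough usage sketch of get_major_lineage_prevalence as it stands after PATCH 5/5. The toy DataFrame below is invented purely for illustration (the real input is prevalence-by-location data such as the prevalence-by-location-all-lineages-test-case.jsonl.gz file read in the scratch code of PATCH 1/5), and the import path is assumed from the repository layout, not confirmed by the patches.

import pandas as pd
from web.handlers.genomics.util import get_major_lineage_prevalence  # import path assumed from repo layout

# Invented toy data: three dates and two lineages; counts are chosen so that
# "xe" appears on only one day inside the window.
df = pd.DataFrame({
    "date": ["2022-03-15", "2022-03-15", "2022-03-16", "2022-03-20"],
    "lineage": ["ba.2", "xe", "ba.2", "ba.2"],
    "lineage_count": [40, 2, 55, 60],
    "total_count": [100, 100, 120, 130],
})

out = get_major_lineage_prevalence(
    df,
    index_col="date",
    min_date="2022-03-15",
    max_date="2022-03-20",
    nday_threshold=2,  # "xe" is seen on fewer than 2 days, so it is folded into "other"
    ndays=180,
)
print(out)

With both bounds supplied, the function keeps rows in the min_date/max_date window, counts how often each lineage clears prevalence_threshold, retains lineages seen at least nday_threshold times, and should return a frame indexed by date and lineage in which the remaining lineages are collapsed into "other".
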