From 1b9833415c0bea4e623ed190df515fc7311bcd39 Mon Sep 17 00:00:00 2001 From: Sarah Randall Date: Thu, 24 Aug 2023 21:03:46 -0700 Subject: [PATCH 1/5] added date params --- web/handlers/genomics/util.py | 64 +++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index 2eaeb36d..ce5e451c 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -2,6 +2,24 @@ from scipy.stats import beta import pandas as pd +import os +os.chdir('/Users/sarahrandall/Downloads') + +data = pd.read_json("prevalence-by-location-all-lineages-test-case.jsonl.gz", lines=True) + +# min_date="2022-03-15" +# max_date="2022-03-20" +# prevalence_threshold = 0.05 + +# data = data[(data["date"].between(min_date, max_date)) & (data["prevalence"] >= prevalence_threshold)] +#index_col should always be "date" + + + + +# # data['proportion'] = data['proportion'].apply(lambda x: x*100) + + def calculate_proportion(_x, _n): x = _x.round() n = _n.round() @@ -19,7 +37,6 @@ def expand_dates(df, date_min, date_max, index_col, grp_col): df .set_index(index_col) .reindex(idx, fill_value = 0) - .drop(grp_col, axis = 1) .reset_index() .rename( columns = { @@ -197,7 +214,7 @@ def create_nested_mutation_query(location_id = None, lineages = [], mutations = parse_location_id_to_query(location_id, query_obj) return query_obj -def classify_other_category(grp, keep_lineages): +def classify_other_category(grp, keep_lineages): # Understood as ignores any lineages user want to keep grp.loc[(~grp["lineage"].isin(keep_lineages)) | (grp["lineage"] == "none"), "lineage"] = "other" # Temporarily remove none. TODO: Proper fix grp = grp.groupby("lineage").agg({ "total_count": lambda x: x.iloc[0], @@ -205,18 +222,37 @@ def classify_other_category(grp, keep_lineages): }) return grp -def get_major_lineage_prevalence(df, index_col, keep_lineages = [], prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180): - date_limit = dt.today() - timedelta(days = ndays) - lineages_to_retain = df[(df["prevalence"] >= prevalence_threshold) & (df["date"] >= date_limit)]["lineage"].value_counts() - num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] - if num_unique_dates < nday_threshold: - nday_threshold = round((nday_threshold/ndays) * num_unique_dates) - lineages_to_retain = lineages_to_retain[lineages_to_retain >= nday_threshold].index.tolist() - lineages_to_retain.extend(keep_lineages) - df = df.groupby(index_col).apply(classify_other_category, lineages_to_retain) - df = df.reset_index() - df.loc[:,"prevalence"] = df["lineage_count"]/df["total_count"] - return df +def get_major_lineage_prevalence(df, index_col = "date", min_date = None, max_date = None, keep_lineages = [], prevalence_threshold = 0.05, nday_threshold = 10, ndays = 180): + + df['prevalence'] = df['total_count']/df['lineage_count'] + df = df.sort_values(by="date") #Sort date values + + if min_date and max_date: + df = df[(df["date"].between(min_date, max_date)) & (df["prevalence"] >= prevalence_threshold)] + if keep_lineages != []: # still unsure about what this is for? + df = df.groupby(index_col).apply(classify_other_category, keep_lineages) + #or grp.loc[(~grp["lineage"].isin(keep_lineages)) | (grp["lineage"] == "none"), "lineage"] = "other" only? + # should any and all lineages not in keep_lineages be called other or doesclassify() do other calculations? 
+ + elif ndays and nday_threshold: + if df["date"].iloc[-1] < dt.today(): #Will not work if ndays is outside of data date range + date = df["date"].iloc[-1] + else: + date = dt.today() + + date_limit = date - timedelta(days = ndays) + df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_limit) & (df['date'] < date)] + + #want to select data between today and timedelta + num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] + if num_unique_dates < nday_threshold: # what's this for? how it relate to threshold? + nday_threshold = round((nday_threshold/ndays) * num_unique_dates) + if keep_lineages != []: + df = df.groupby(index_col).apply(classify_other_category, keep_lineages) + + return df + + def parse_location_id_to_query(query_id, query_obj = None): if query_id == None: From 546ad8b5352672182b79b0c93826e1ef95d1a443 Mon Sep 17 00:00:00 2001 From: Sarah Randall Date: Tue, 29 Aug 2023 19:37:10 -0700 Subject: [PATCH 2/5] updating get_major_lineage_prevalence --- web/handlers/genomics/util.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index ce5e451c..8e6ed06c 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -2,23 +2,6 @@ from scipy.stats import beta import pandas as pd -import os -os.chdir('/Users/sarahrandall/Downloads') - -data = pd.read_json("prevalence-by-location-all-lineages-test-case.jsonl.gz", lines=True) - -# min_date="2022-03-15" -# max_date="2022-03-20" -# prevalence_threshold = 0.05 - -# data = data[(data["date"].between(min_date, max_date)) & (data["prevalence"] >= prevalence_threshold)] -#index_col should always be "date" - - - - -# # data['proportion'] = data['proportion'].apply(lambda x: x*100) - def calculate_proportion(_x, _n): x = _x.round() From 64daf212220681540b03b43118837a82947ac7c4 Mon Sep 17 00:00:00 2001 From: Sarah Randall Date: Tue, 29 Aug 2023 19:37:10 -0700 Subject: [PATCH 3/5] updating get_major_lineage_prevalence --- web/handlers/genomics/util.py | 69 +++++++++++++++++------------------ 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index ce5e451c..8534685a 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -2,23 +2,6 @@ from scipy.stats import beta import pandas as pd -import os -os.chdir('/Users/sarahrandall/Downloads') - -data = pd.read_json("prevalence-by-location-all-lineages-test-case.jsonl.gz", lines=True) - -# min_date="2022-03-15" -# max_date="2022-03-20" -# prevalence_threshold = 0.05 - -# data = data[(data["date"].between(min_date, max_date)) & (data["prevalence"] >= prevalence_threshold)] -#index_col should always be "date" - - - - -# # data['proportion'] = data['proportion'].apply(lambda x: x*100) - def calculate_proportion(_x, _n): x = _x.round() @@ -226,31 +209,45 @@ def get_major_lineage_prevalence(df, index_col = "date", min_date = None, max_da df['prevalence'] = df['total_count']/df['lineage_count'] df = df.sort_values(by="date") #Sort date values + min_date = dt.strptime(min_date, "%Y-%m-%d") + max_date = dt.strptime(max_date, "%Y-%m-%d") if min_date and max_date: df = df[(df["date"].between(min_date, max_date)) & (df["prevalence"] >= prevalence_threshold)] - if keep_lineages != []: # still unsure about what this is for? 
- df = df.groupby(index_col).apply(classify_other_category, keep_lineages) - #or grp.loc[(~grp["lineage"].isin(keep_lineages)) | (grp["lineage"] == "none"), "lineage"] = "other" only? - # should any and all lineages not in keep_lineages be called other or doesclassify() do other calculations? - - elif ndays and nday_threshold: - if df["date"].iloc[-1] < dt.today(): #Will not work if ndays is outside of data date range + + elif min_date: + date_limit = dt.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays) + df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > min_date) & (df['date'] < date_limit)] + + elif max_date: + date_limit = dt.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays) + df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] < max_date) & (df['date'] > date_limit)] + + + if df["date"].iloc[-1] < dt.today(): date = df["date"].iloc[-1] - else: + else: date = dt.today() - - date_limit = date - timedelta(days = ndays) - df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_limit) & (df['date'] < date)] - - #want to select data between today and timedelta - num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] - if num_unique_dates < nday_threshold: # what's this for? how it relate to threshold? - nday_threshold = round((nday_threshold/ndays) * num_unique_dates) - if keep_lineages != []: - df = df.groupby(index_col).apply(classify_other_category, keep_lineages) + date_range = date - timedelta(days = ndays) + df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_range) & (df['date'] < date)] + + date_limit = date - timedelta(days = ndays) + num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] + if num_unique_dates < nday_threshold: + nday_threshold = round((nday_threshold/ndays) * num_unique_dates) + + date_range = date - timedelta(days = nday_threshold) #Finding lineages nday_threshold days in the date range + lineages_to_retain = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_range) & (df['date'] < date)] + lineages_to_retain = lineages_to_retain['lineage'].to_list() + keep_lineages.append(lineages_to_retain) + + df = df.groupby(index_col).apply(classify_other_category, keep_lineages) + + return df - return df + # In order to be considered important enough to not be grouped into the "other" lineage, + # a lineage needs to appear on at least nday_threshold days in the date range -- + # so num_unique_dates > nday_threshold. 
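
A minimal standalone sketch (not part of any patch) of the date-window selection that PATCH 1/5 through PATCH 3/5 iterate on inside get_major_lineage_prevalence: when both bounds are given the window is explicit, and a single bound anchors a window of ndays on that side. It assumes df["date"] holds ISO-formatted ("YYYY-MM-DD") date strings; the helper name select_date_window is illustrative only and does not exist in util.py.

from datetime import datetime, timedelta
import pandas as pd

def select_date_window(df, min_date=None, max_date=None, ndays=180):
    # Keep only rows whose "date" falls in the requested window.
    df = df.sort_values("date")
    if min_date and max_date:
        # Explicit window: keep everything between the two bounds (inclusive).
        return df[df["date"].between(min_date, max_date)]
    if min_date:
        # Only a lower bound: look ndays forward from min_date.
        upper = datetime.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays)
        return df[(df["date"] >= min_date) & (df["date"] <= upper.strftime("%Y-%m-%d"))]
    if max_date:
        # Only an upper bound: look ndays back from max_date.
        lower = datetime.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays)
        return df[(df["date"] >= lower.strftime("%Y-%m-%d")) & (df["date"] <= max_date)]
    return df

Because ISO date strings sort lexicographically in calendar order, the comparisons above work without converting the whole column to datetimes.
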
From 233ee00a45f66b1f288f5430e050a6e08d7cef0d Mon Sep 17 00:00:00 2001 From: Sarah Randall Date: Sun, 1 Oct 2023 21:38:01 -0700 Subject: [PATCH 4/5] fixed logic --- web/handlers/genomics/util.py | 46 ++++++++++------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index 8534685a..603fbf6e 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -209,48 +209,28 @@ def get_major_lineage_prevalence(df, index_col = "date", min_date = None, max_da df['prevalence'] = df['total_count']/df['lineage_count'] df = df.sort_values(by="date") #Sort date values - min_date = dt.strptime(min_date, "%Y-%m-%d") - max_date = dt.strptime(max_date, "%Y-%m-%d") + if min_date and max_date: df = df[(df["date"].between(min_date, max_date)) & (df["prevalence"] >= prevalence_threshold)] - + num_unique_dates = df[df["date"] >= min_date]["date"].unique().shape[0] #counts # of unique days lineage is found elif min_date: - date_limit = dt.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays) + date_limit = dt.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays) # searches from min_date to ndays forward df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > min_date) & (df['date'] < date_limit)] - - elif max_date: - date_limit = dt.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays) - df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] < max_date) & (df['date'] > date_limit)] - - - if df["date"].iloc[-1] < dt.today(): - date = df["date"].iloc[-1] + num_unique_dates = df[df["date"] <= date_limit]["date"].unique().shape[0] else: - date = dt.today() - date_range = date - timedelta(days = ndays) - df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_range) & (df['date'] < date)] - - date_limit = date - timedelta(days = ndays) - num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] - if num_unique_dates < nday_threshold: - nday_threshold = round((nday_threshold/ndays) * num_unique_dates) - - date_range = date - timedelta(days = nday_threshold) #Finding lineages nday_threshold days in the date range - lineages_to_retain = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > date_range) & (df['date'] < date)] - lineages_to_retain = lineages_to_retain['lineage'].to_list() - keep_lineages.append(lineages_to_retain) + date_limit = dt.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays) # searches from max_date to ndays back + df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] < max_date) & (df['date'] > date_limit)] + num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] - df = df.groupby(index_col).apply(classify_other_category, keep_lineages) - + if num_unique_dates < nday_threshold: + nday_threshold = round((nday_threshold/ndays) * num_unique_dates) + lineage_counts = df["lineage"].value_counts() #number of times lineage is found in df + lineages_to_retain = lineage_counts[lineage_counts >= nday_threshold].index.to_list() #lineages found at least [nday_threshold] times won't be grouped + keep_lineages.extend(lineages_to_retain) + df = df.groupby(index_col).apply(classify_other_category, lineages_to_retain) return df - # In order to be considered important enough to not be grouped into the "other" lineage, - # a lineage needs to appear on at least nday_threshold days in the date range -- - # so num_unique_dates > nday_threshold. 
- - - def parse_location_id_to_query(query_id, query_obj = None): if query_id == None: return None From 849427bf046c0bcc74089c1daec7eac05135a043 Mon Sep 17 00:00:00 2001 From: Sarah Randall Date: Sun, 15 Oct 2023 18:03:27 -0700 Subject: [PATCH 5/5] final changes --- web/handlers/genomics/util.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/web/handlers/genomics/util.py b/web/handlers/genomics/util.py index 603fbf6e..fbb2873e 100644 --- a/web/handlers/genomics/util.py +++ b/web/handlers/genomics/util.py @@ -210,22 +210,20 @@ def get_major_lineage_prevalence(df, index_col = "date", min_date = None, max_da df['prevalence'] = df['total_count']/df['lineage_count'] df = df.sort_values(by="date") #Sort date values - if min_date and max_date: - df = df[(df["date"].between(min_date, max_date)) & (df["prevalence"] >= prevalence_threshold)] - num_unique_dates = df[df["date"] >= min_date]["date"].unique().shape[0] #counts # of unique days lineage is found + df = df[(df["date"].between(min_date, max_date))] elif min_date: date_limit = dt.strptime(min_date, "%Y-%m-%d") + timedelta(days=ndays) # searches from min_date to ndays forward - df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] > min_date) & (df['date'] < date_limit)] - num_unique_dates = df[df["date"] <= date_limit]["date"].unique().shape[0] + df = df[(df['date'] >= min_date) & (df['date'] <= date_limit)] else: date_limit = dt.strptime(max_date, "%Y-%m-%d") - timedelta(days=ndays) # searches from max_date to ndays back - df = df[(df["prevalence"] >= prevalence_threshold) & (df['date'] < max_date) & (df['date'] > date_limit)] - num_unique_dates = df[df["date"] >= date_limit]["date"].unique().shape[0] + df = df[(df['date'] <= max_date) & (df['date'] >= date_limit)] + + num_unique_dates = df["date"].unique().shape[0] #counts # of unique days lineage is found if num_unique_dates < nday_threshold: nday_threshold = round((nday_threshold/ndays) * num_unique_dates) - lineage_counts = df["lineage"].value_counts() #number of times lineage is found in df + lineage_counts = df[(df["prevalence"] >= prevalence_threshold)]["lineage"].value_counts() #number of times lineage is found in df lineages_to_retain = lineage_counts[lineage_counts >= nday_threshold].index.to_list() #lineages found at least [nday_threshold] times won't be grouped keep_lineages.extend(lineages_to_retain) df = df.groupby(index_col).apply(classify_other_category, lineages_to_retain) @@ -243,7 +241,7 @@ def parse_location_id_to_query(query_id, query_obj = None): } location_types = ["country_id", "division_id", "location_id"] for i in range(min(3, len(location_codes))): - if i == 1 and len(location_codes[i].split("-")) > 1: # For division remove iso2 code if present + if i == 1 and len(location_codes[i].split("-")) > 1: # For division remove iso2 code if present location_codes[i] = location_codes[i].split("-")[1] if "must" in query_obj["bool"]: query_obj["bool"]["must"].append({
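
For reference, a rough usage sketch of get_major_lineage_prevalence as it stands after PATCH 5/5. The toy DataFrame below is invented purely for illustration (the real input is prevalence-by-location data such as the prevalence-by-location-all-lineages-test-case.jsonl.gz file read in the scratch code of PATCH 1/5), and the import path is assumed from the repository layout, not confirmed by the patches.

import pandas as pd
from web.handlers.genomics.util import get_major_lineage_prevalence  # import path assumed from repo layout

# Invented toy data: three dates and two lineages; counts are chosen so that
# "xe" appears on only one day inside the window.
df = pd.DataFrame({
    "date": ["2022-03-15", "2022-03-15", "2022-03-16", "2022-03-20"],
    "lineage": ["ba.2", "xe", "ba.2", "ba.2"],
    "lineage_count": [40, 2, 55, 60],
    "total_count": [100, 100, 120, 130],
})

out = get_major_lineage_prevalence(
    df,
    index_col="date",
    min_date="2022-03-15",
    max_date="2022-03-20",
    nday_threshold=2,  # "xe" is seen on fewer than 2 days, so it is folded into "other"
    ndays=180,
)
print(out)

With both bounds supplied, the function keeps rows in the min_date/max_date window, counts how often each lineage clears prevalence_threshold, retains lineages seen at least nday_threshold times, and should return a frame indexed by date and lineage in which the remaining lineages are collapsed into "other".
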