From 8c1224671bc88fe490619ab90c77eb81502ee691 Mon Sep 17 00:00:00 2001
From: Pierre-Olivier Simonard <pierre.olivier.simonard@gmail.com>
Date: Tue, 27 Aug 2024 17:44:17 +0200
Subject: [PATCH] Fix for "None" values in the variables

---
 web/dashboard/app.py            | 21 ++++++++++++++-------
 web/dashboard/main_dashboard.py | 21 ++++++++++-----------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/web/dashboard/app.py b/web/dashboard/app.py
index 3a828f33..267b1fc8 100644
--- a/web/dashboard/app.py
+++ b/web/dashboard/app.py
@@ -40,14 +40,21 @@ def load_data():
     raw_data = raw_data[raw_data.year >= 2000]
 
     # necessary conversion to tuples, which is hashable type
-    # needed for grouping
-    for col in ["funder", "data_tags"]:
-        raw_data[col] = raw_data[col].apply(lambda x: tuple(x))
+    # needed for grouping.
+    # Also removes duplicates and strips leading and trailing spaces from each value.
+    # Lists that are empty or contain only "" become ("None",) to simplify filtering and grouping in the dashboard
+    for col in ["funder", "affiliation_country", "data_tags"]:
+        raw_data[col] = raw_data[col].apply(
+            lambda x: ("None",)
+            if (len(x) == 0 or len(x) == 1 and x[0] == "")
+            else tuple(set([v.strip() for v in x]))
+        )
 
-    # convert to tuple, remove duplicates and remove leading and trailing spaces in countries names
-    raw_data["affiliation_country"] = raw_data["affiliation_country"].apply(
-        lambda x: tuple(set([v.strip() for v in x]))
-    )
+    # Drop every row published in the journal below (distracting, odd-looking data)
+    raw_data = raw_data[
+        raw_data["journal"]
+        != "Acta Crystallographica Section E: Structure Reports Online"
+    ]
 
     return raw_data
 
diff --git a/web/dashboard/main_dashboard.py b/web/dashboard/main_dashboard.py
index c98beab8..42630c87 100644
--- a/web/dashboard/main_dashboard.py
+++ b/web/dashboard/main_dashboard.py
@@ -224,7 +224,7 @@ def did_change_extraction_tool(self):
 
         ## affiliation country
         countries_with_count = self.get_col_values_with_count(
-            "affiliation_country", lambda x: len(x) == 0 or len(x) == 1 and x[0] == ""
+            "affiliation_country", lambda x: "None" in x
         )
 
         def country_sorter(c):
@@ -236,7 +236,7 @@ def country_sorter(c):
 
         ## funder
         funders_with_count = self.get_col_values_with_count(
-            "funder", lambda x: len(x) == 0 or len(x) == 1 and x[0] == ""
+            "funder", lambda x: "None" in x
         )
 
         def funder_sorter(c):
@@ -248,7 +248,7 @@ def funder_sorter(c):
 
         ## Tags
         tags_with_count = self.get_col_values_with_count(
-            "data_tags", lambda x: len(x) == 0 or len(x) == 1 and x[0] == ""
+            "data_tags", lambda x: "None" in x
         )
 
         def tags_sorter(c):
@@ -300,7 +300,7 @@ def did_change_splitting_var(self):
             # We want to show all countries, but pre-select only the top 10
             countries_with_count = self.get_col_values_with_count(
                 "affiliation_country",
-                lambda x: len(x) == 0 or len(x) == 1 and x[0] == "",
+                lambda x: "None" in x,
             )
 
             # pre-filter the countries because there are a lot
@@ -331,7 +331,7 @@ def did_change_splitting_var(self):
         if splitting_var == "funder":
             # We want to show all funders, but pre-select only the top 10
             funders_with_count = self.get_col_values_with_count(
-                "funder", lambda x: len(x) == 0 or len(x) == 1 and x[0] == ""
+                "funder", lambda x: "None" in x
             )
 
             top_5_min = sorted(
@@ -400,7 +400,7 @@ def filtered_grouped_data(self):
             # the filter on countries is a bit different as the rows
             # are list of countries
             def country_filter(cell):
-                if cell is None:
+                if len(cell) == 0 or len(cell) == 1 and cell[0] == "":
                     return "None" in self.filter_affiliation_country
                 return any(c in self.filter_affiliation_country for c in cell)
 
@@ -424,22 +424,22 @@ def funder_filter(cell):
         ):
             # the filter on tags is similar to the filter on countries
             def tags_filter(cell):
-                if cell is None:
+                if len(cell) == 0 or len(cell) == 1 and cell[0] == "":
                     return "None" in self.filter_tags
                 return any(c in self.filter_tags for c in cell)
 
             filtered_df = filtered_df[filtered_df.data_tags.apply(tags_filter)]
 
-        aggretations = {}
+        aggregations = {}
         for field, aggs in dims_aggregations.items():
             for agg in aggs:
-                aggretations[f"{agg}_{field}"] = (field, aggregation_formulas[agg])
+                aggregations[f"{agg}_{field}"] = (field, aggregation_formulas[agg])
 
         groupers = ["year"]
         if self.splitting_var != "None":
             groupers.append(self.splitting_var_from_label(self.splitting_var))
 
-        result = filtered_df.groupby(groupers).agg(**aggretations).reset_index()
+        result = filtered_df.groupby(groupers).agg(**aggregations).reset_index()
 
         print("FILTERED_GROUPED_DATA_DONE", len(result))
 
@@ -705,7 +705,6 @@ def update_pubdate_slider(event):
         )
 
         def did_click_shortcut_button(event):
-            print(event)
             if event.obj.name == "Last year":
                 self.start_pubdate_input.value, self.end_pubdate_input.value = (
                     str(datetime.now().year),