From 8c1224671bc88fe490619ab90c77eb81502ee691 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Simonard Date: Tue, 27 Aug 2024 17:44:17 +0200 Subject: [PATCH] Fix for "None" values in the variables --- web/dashboard/app.py | 21 ++++++++++++++------- web/dashboard/main_dashboard.py | 21 ++++++++++----------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/web/dashboard/app.py b/web/dashboard/app.py index 3a828f33..267b1fc8 100644 --- a/web/dashboard/app.py +++ b/web/dashboard/app.py @@ -40,14 +40,21 @@ def load_data(): raw_data = raw_data[raw_data.year >= 2000] # necessary conversion to tuples, which is hashable type - # needed for grouping - for col in ["funder", "data_tags"]: - raw_data[col] = raw_data[col].apply(lambda x: tuple(x)) + # needed for grouping. + # Also removes duplicates and removes leading and trailing spaces in values. + # Also replaces empty lists with ("None", ) to simplify the filtering and grouping in the dashboard + for col in ["funder", "affiliation_country", "data_tags"]: + raw_data[col] = raw_data[col].apply( + lambda x: ("None",) + if (len(x) == 0 or len(x) == 1 and x[0] == "") + else tuple(set([v.strip() for v in x])) + ) - # convert to tuple, remove duplicates and remove leading and trailing spaces in countries names - raw_data["affiliation_country"] = raw_data["affiliation_country"].apply( - lambda x: tuple(set([v.strip() for v in x])) - ) + # Filter out some distracting weird data + raw_data = raw_data[ + raw_data["journal"] + != "Acta Crystallographica Section E: Structure Reports Online" + ] return raw_data diff --git a/web/dashboard/main_dashboard.py b/web/dashboard/main_dashboard.py index c98beab8..42630c87 100644 --- a/web/dashboard/main_dashboard.py +++ b/web/dashboard/main_dashboard.py @@ -224,7 +224,7 @@ def did_change_extraction_tool(self): ## affiliation country countries_with_count = self.get_col_values_with_count( - "affiliation_country", lambda x: len(x) == 0 or len(x) == 1 and x[0] == "" + "affiliation_country", lambda x: "None" in x ) def country_sorter(c): @@ -236,7 +236,7 @@ def country_sorter(c): ## funder funders_with_count = self.get_col_values_with_count( - "funder", lambda x: len(x) == 0 or len(x) == 1 and x[0] == "" + "funder", lambda x: "None" in x ) def funder_sorter(c): @@ -248,7 +248,7 @@ def funder_sorter(c): ## Tags tags_with_count = self.get_col_values_with_count( - "data_tags", lambda x: len(x) == 0 or len(x) == 1 and x[0] == "" + "data_tags", lambda x: "None" in x ) def tags_sorter(c): @@ -300,7 +300,7 @@ def did_change_splitting_var(self): # We want to show all countries, but pre-select only the top 10 countries_with_count = self.get_col_values_with_count( "affiliation_country", - lambda x: len(x) == 0 or len(x) == 1 and x[0] == "", + lambda x: "None" in x, ) # pre-filter the countries because there are a lot @@ -331,7 +331,7 @@ def did_change_splitting_var(self): if splitting_var == "funder": # We want to show all funders, but pre-select only the top 10 funders_with_count = self.get_col_values_with_count( - "funder", lambda x: len(x) == 0 or len(x) == 1 and x[0] == "" + "funder", lambda x: "None" in x ) top_5_min = sorted( @@ -400,7 +400,7 @@ def filtered_grouped_data(self): # the filter on countries is a bit different as the rows # are list of countries def country_filter(cell): - if cell is None: + if len(cell) == 0 or len(cell) == 1 and cell[0] == "": return "None" in self.filter_affiliation_country return any(c in self.filter_affiliation_country for c in cell) @@ -424,22 +424,22 @@ def funder_filter(cell): ): # the filter on tags is similar to the filter on countries def tags_filter(cell): - if cell is None: + if len(cell) == 0 or len(cell) == 1 and cell[0] == "": return "None" in self.filter_tags return any(c in self.filter_tags for c in cell) filtered_df = filtered_df[filtered_df.data_tags.apply(tags_filter)] - aggretations = {} + aggregations = {} for field, aggs in dims_aggregations.items(): for agg in aggs: - aggretations[f"{agg}_{field}"] = (field, aggregation_formulas[agg]) + aggregations[f"{agg}_{field}"] = (field, aggregation_formulas[agg]) groupers = ["year"] if self.splitting_var != "None": groupers.append(self.splitting_var_from_label(self.splitting_var)) - result = filtered_df.groupby(groupers).agg(**aggretations).reset_index() + result = filtered_df.groupby(groupers).agg(**aggregations).reset_index() print("FILTERED_GROUPED_DATA_DONE", len(result)) @@ -705,7 +705,6 @@ def update_pubdate_slider(event): ) def did_click_shortcut_button(event): - print(event) if event.obj.name == "Last year": self.start_pubdate_input.value, self.end_pubdate_input.value = ( str(datetime.now().year),