From 946461958959a9ffad2f4fdbbb0c880a929f2232 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Simonard Date: Mon, 26 Aug 2024 11:54:29 +0200 Subject: [PATCH] Handling funders + various improvements --- web/dashboard/app.py | 9 ++ web/dashboard/main_dashboard.py | 178 ++++++++++++++++++++++++-------- 2 files changed, 143 insertions(+), 44 deletions(-) diff --git a/web/dashboard/app.py b/web/dashboard/app.py index d5b2326f..1677fe65 100644 --- a/web/dashboard/app.py +++ b/web/dashboard/app.py @@ -36,6 +36,14 @@ def load_data(): raw_data = tb.to_pandas() raw_data["metrics"] = "RTransparent" raw_data = raw_data[raw_data.year >= 2000] + + # necessary conversion to tuples, which is hashable type + # needed for grouping + raw_data.affiliation_country = raw_data.affiliation_country.apply( + lambda cntry: tuple(cntry) + ) + raw_data.funder = raw_data.funder.apply(lambda fndrs: tuple(fndrs)) + return raw_data @@ -79,6 +87,7 @@ def get_template(self): "css/global/vars.css", "css/global/flat.css", "css/global/intro.css", + "css/global/vars.css", ], ) # diff --git a/web/dashboard/main_dashboard.py b/web/dashboard/main_dashboard.py index 1a670175..0c49331d 100644 --- a/web/dashboard/main_dashboard.py +++ b/web/dashboard/main_dashboard.py @@ -24,7 +24,7 @@ "None", "journal", "affiliation_country", - "fund_pmc_institute", + "funder", ], } } @@ -67,8 +67,7 @@ class MainDashboard(param.Parameterized): metrics = param.Selector(default=[], objects=[], label="Metrics") splitting_var = param.Selector( - default="year", - objects=["year", "fund_pmc_institute"], + objects=[], label="Splitting Variable", ) @@ -81,6 +80,8 @@ class MainDashboard(param.Parameterized): default=[], objects=[], label="Country" ) + filter_funder = param.ListSelector(default=[], objects=[], label="Funder") + # UI elements echarts_pane = pn.pane.ECharts( {}, height=640, width=960, renderer="svg", options={"replaceMerge": ["series"]} @@ -112,6 +113,13 @@ def __init__(self, datasets, **params): ), ) + self.funder_select_picker = SelectPicker.from_param( + self.param.filter_funder, + update_title_callback=lambda select_picker, + values, + options: self.new_picker_title("funders", select_picker, values, options), + ) + self.build_pubdate_filter() @pn.depends("extraction_tool", watch=True) @@ -158,7 +166,6 @@ def did_change_extraction_tool(self): self.param.filter_journal.objects = self.raw_data.journal.unique() ## affiliation country - ## Keeping "None" as a string on purpose, to represent it in the SelectPicker countries_with_count = self.get_countries_with_count() def country_sorter(c): @@ -168,15 +175,38 @@ def country_sorter(c): countries_with_count.keys(), key=country_sorter, reverse=True ) + ## funder + funders_with_count = self.get_funders_with_count() + + def funder_sorter(c): + return funders_with_count[c] + + self.param.filter_funder.objects = sorted( + funders_with_count.keys(), key=funder_sorter, reverse=True + ) + # This triggers function "did_change_splitting_var" - # which updates the filter_journal and filter_affiliation_country + # which updates filter_journal, filter_affiliation_country and filter_funder self.splitting_var = self.param.splitting_var.objects[0] + @lru_cache + def get_funders_with_count(self): + funders = {} + for row in self.raw_data.funder.values: + if len(row) == 0 or len(row) == 1 and row[0] == "": + ## Keeping "None" as a string on purpose, to represent it in the SelectPicker + funders["None"] = funders.get("None", 0) + 1 + else: + for c in row: + funders[c] = funders.get(c, 0) + 1 + return funders + @lru_cache def get_countries_with_count(self): countries = {} for row in self.raw_data.affiliation_country.values: if row is None: + ## Keeping "None" as a string on purpose, to represent it in the SelectPicker countries["None"] = countries.get("None", 0) + 1 else: for c in row: @@ -187,18 +217,27 @@ def get_countries_with_count(self): def did_change_splitting_var(self): print("DID_CHANGE_SPLITTING_VAR", self.splitting_var) + # already set the echarts pane as loading for a better UX + self.echarts_pane.loading = True + + notif_msg = None if self.splitting_var == "journal": # We want to show all journals, but pre-select only the top 10 - self.filter_journal = list( - self.raw_data.journal.value_counts().iloc[:10].index + selected_journals = list( + self.raw_data.query("journal != 'None'") + .journal.value_counts() + .iloc[:10] + .index ) notif_msg = "Splitting by journal. Top 10 journals selected by default." else: - self.filter_journal = self.param.filter_journal.objects + selected_journals = self.param.filter_journal.objects if self.splitting_var == "affiliation_country": # We want to show all countries, but pre-select only the top 10 countries_with_count = self.get_countries_with_count() + + # pre-filter the countries because there are a lot countries_with_count = { country: count for country, count in countries_with_count.items() @@ -206,25 +245,61 @@ def did_change_splitting_var(self): } top_10_min = sorted( - [count for _, count in countries_with_count.items()], reverse=True + [ + count + for country, count in countries_with_count.items() + if country != "None" + ], + reverse=True, )[10] selected_countries = [ country for country, count in countries_with_count.items() - if count >= top_10_min + if count > top_10_min and country != "None" ] - self.filter_affiliation_country = selected_countries notif_msg = "Splitting by affiliation country. Top 10 countries selected by default." else: - self.filter_affiliation_country = ( - self.param.filter_affiliation_country.objects - ) + selected_countries = self.param.filter_affiliation_country.objects + + if self.splitting_var == "funder": + # We want to show all funders, but pre-select only the top 10 + funders_with_count = self.get_funders_with_count() + + top_5_min = sorted( + [ + count + for funder, count in funders_with_count.items() + if funder != "None" + ], + reverse=True, + )[5] + selected_funders = [ + funder + for funder, count in funders_with_count.items() + if count > top_5_min and funder != "None" + ] + + notif_msg = "Splitting by funder. Top 5 Funders selected by default." + else: + selected_funders = self.param.filter_funder.objects + + # Trigger a batch update of the filters value, + # preventing from re-rendering the dashboard several times + # and preventing intermediate states where the dashboard renders onces + # with all funders for instance, and then restricting on the selected funders. + print("TRIGGER UPDATE") + self.param.update( + filter_journal=selected_journals, + filter_affiliation_country=selected_countries, + filter_funder=selected_funders, + ) if self.splitting_var == "None": notif_msg = "No more splitting. Filters reset to default" - pn.state.notifications.info(notif_msg, duration=5000) + if notif_msg is not None: + pn.state.notifications.info(notif_msg, duration=5000) def filtered_grouped_data(self): print("FILTERED_GROUPED_DATA") @@ -256,6 +331,17 @@ def country_filter(cell): filtered_df.affiliation_country.apply(country_filter) ] + if len(filtered_df) > 0 and len(self.filter_funder) != len( + self.param.filter_funder.objects + ): + # the filter on funders is similar to the filter on countries + def funder_filter(cell): + if len(cell) == 0 or len(cell) == 1 and cell[0] == "": + return "None" in self.filter_funder + return any(c in self.filter_funder for c in cell) + + filtered_df = filtered_df[filtered_df.funder.apply(funder_filter)] + aggretations = {} for field, aggs in dims_aggregations.items(): for agg in aggs: @@ -270,11 +356,11 @@ def country_filter(cell): return result @pn.depends( - # "splitting_var", - "filter_pubdate", "metrics", + "filter_pubdate", "filter_affiliation_country", "filter_journal", + "filter_funder", watch=True, ) def updated_echart_plot(self): @@ -317,10 +403,10 @@ def updated_echart_plot(self): splitting_var_column = "affiliation_country" splitting_var_query = lambda cell, selected_item: selected_item in cell - elif self.splitting_var == "fund_pmc_institute": - splitting_var_filter = self.filter_fund_pmc_institute - splitting_var_column = "fund_pmc_institute" - splitting_var_query = lambda cell, selected_item: cell == selected_item + elif self.splitting_var == "funder": + splitting_var_filter = self.filter_funder + splitting_var_column = "funder" + splitting_var_query = lambda cell, selected_item: selected_item in cell else: print("Defaulting to splitting var 'journal' ") splitting_var_filter = self.filter_journal @@ -329,28 +415,31 @@ def updated_echart_plot(self): for selected_item in sorted(splitting_var_filter): # sub_df = df.query(f"{splitting_var_column} == '{selected_item}'") - sub_df = ( - df[ - df[splitting_var_column].apply( - lambda x: splitting_var_query(x, selected_item) - ) - ] - .groupby("year") - .agg({raw_metric: "mean"}) # todo fix this - .reset_index() - ) - series.append( - { - "id": selected_item, - "name": selected_item, - "type": "line", - "data": sub_df[raw_metric].tolist(), - } - ) - legend_data.append( - {"name": selected_item, "icon": "path://M 0 0 H 20 V 20 H 0 Z"} - ) + sub_df = df[ + df[splitting_var_column].apply( + lambda x: splitting_var_query(x, selected_item) + ) + ] + + if len(sub_df) > 0: + sub_df = ( + sub_df.groupby("year") + .agg({raw_metric: "mean"}) # todo fix this + .reset_index() + ) + + series.append( + { + "id": selected_item, + "name": selected_item, + "type": "line", + "data": sub_df[raw_metric].tolist(), + } + ) + legend_data.append( + {"name": selected_item, "icon": "path://M 0 0 H 20 V 20 H 0 Z"} + ) echarts_config = { "title": { @@ -394,8 +483,8 @@ def updated_echart_plot(self): def build_pubdate_filter(self): print("BUILD_PUBDATE_FILTER") - # The text input only reflect and update the values of the slider bounds - # self.pubdate_slider = pn.widgets.RangeSlider.from_param(self.param.filter_pubdate) + # The text input only reflect the values of the slider bounds, + # and update the slider bounds when their text value is changed. self.pubdate_slider = pn.widgets.IntRangeSlider( start=int(self.param.filter_pubdate.bounds[0]), end=int(self.param.filter_pubdate.bounds[1]), @@ -504,6 +593,7 @@ def get_sidebar(self): divider(), self.journal_select_picker, self.affiliation_country_select_picker, + self.funder_select_picker, ] sidebar = pn.Column(*items)