diff --git a/.gitattributes b/.gitattributes index 6e6a3da8..6ccbee3d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6,4 +6,3 @@ # Enable binary delta compression for PNG files *.png -delta - diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 605efde9..9248b113 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,9 +1,4 @@ repos: - - repo: https://github.com/psf/black - rev: 24.4.2 - hooks: - - id: black - - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.5.0 diff --git a/pyproject.toml b/pyproject.toml index 1808d06b..f13b6a4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ keywords = [ dynamic = ["version"] dependencies = [ "dill", + "colorcet", "pandas", "pyarrow", "pydantic", diff --git a/web/dashboard/app.py b/web/dashboard/app.py index cd8af205..3a828f33 100644 --- a/web/dashboard/app.py +++ b/web/dashboard/app.py @@ -41,9 +41,14 @@ def load_data(): # necessary conversion to tuples, which is hashable type # needed for grouping - for col in ["affiliation_country", "funder", "data_tags"]: + for col in ["funder", "data_tags"]: raw_data[col] = raw_data[col].apply(lambda x: tuple(x)) + # convert to tuple, remove duplicates and remove leading and trailing spaces in countries names + raw_data["affiliation_country"] = raw_data["affiliation_country"].apply( + lambda x: tuple(set([v.strip() for v in x])) + ) + return raw_data diff --git a/web/dashboard/main_dashboard.py b/web/dashboard/main_dashboard.py index 749eb6ef..c98beab8 100644 --- a/web/dashboard/main_dashboard.py +++ b/web/dashboard/main_dashboard.py @@ -32,12 +32,22 @@ "funder", "data_tags", ], + "labels": { + "None": "None", + "journal": "Journal", + "affiliation_country": "Country", + "funder": "Funder", + "data_tags": "Tags", + }, } } dims_aggregations = { - "is_open_data": ["percent", "count_true"], - "is_open_code": ["percent", "count_true"], + "is_open_data": ["percent", "count_true", "count"], + "is_open_code": [ + "percent", + "count_true", + ], # "score": ["mean"], # "eigenfactor_score": ["mean"], } @@ -47,6 +57,7 @@ "percent_is_open_code": "Code Sharing (%)", "count_true_is_open_data": "Data Sharing", "count_true_is_open_code": "Code Sharing", + "count_is_open_data": "Total number of publications", "mean_score": "Mean Score", "mean_eigenfactor_score": "Mean Eigenfactor Score", } @@ -156,6 +167,16 @@ def __init__(self, datasets, **params): # DEBUG self.echarts_update_button.on_click(self.did_click_update_echart_plot) + def splitting_var_label(self, splitting_var): + return extraction_tools_params[self.extraction_tool]["labels"][splitting_var] + + def splitting_var_from_label(self, label): + return [ + k + for k, v in extraction_tools_params[self.extraction_tool]["labels"].items() + if v == label + ][0] + @pn.depends("extraction_tool", watch=True) def did_change_extraction_tool(self): print("DID_CHANGE_EXTRACTION_TOOL") @@ -180,7 +201,9 @@ def did_change_extraction_tool(self): new_extraction_tools_splitting_vars = extraction_tools_params[ self.extraction_tool ]["splitting_vars"] - self.param.splitting_var.objects = new_extraction_tools_splitting_vars + self.param.splitting_var.objects = [ + self.splitting_var_label(v) for v in new_extraction_tools_splitting_vars + ] # Update the filters ## filter_pubdate @@ -201,7 +224,7 @@ def did_change_extraction_tool(self): ## affiliation country countries_with_count = self.get_col_values_with_count( - "affiliation_country", lambda x: x is None + "affiliation_country", lambda x: len(x) == 0 or len(x) == 1 and x[0] == "" ) def country_sorter(c): @@ -225,7 +248,7 @@ def funder_sorter(c): ## Tags tags_with_count = self.get_col_values_with_count( - "data_tags", lambda x: x is None + "data_tags", lambda x: len(x) == 0 or len(x) == 1 and x[0] == "" ) def tags_sorter(c): @@ -258,8 +281,10 @@ def did_change_splitting_var(self): # already set the echarts pane as loading for a better UX self.echarts_pane.loading = True + splitting_var = self.splitting_var_from_label(self.splitting_var) + notif_msg = None - if self.splitting_var == "journal": + if splitting_var == "journal": # We want to show all journals, but pre-select only the top 10 selected_journals = list( self.raw_data.query("journal != 'None'") @@ -271,10 +296,11 @@ def did_change_splitting_var(self): else: selected_journals = self.param.filter_journal.objects - if self.splitting_var == "affiliation_country": + if splitting_var == "affiliation_country": # We want to show all countries, but pre-select only the top 10 countries_with_count = self.get_col_values_with_count( - "affiliation_country", lambda x: x is None + "affiliation_country", + lambda x: len(x) == 0 or len(x) == 1 and x[0] == "", ) # pre-filter the countries because there are a lot @@ -302,7 +328,7 @@ def did_change_splitting_var(self): else: selected_countries = self.param.filter_affiliation_country.objects - if self.splitting_var == "funder": + if splitting_var == "funder": # We want to show all funders, but pre-select only the top 10 funders_with_count = self.get_col_values_with_count( "funder", lambda x: len(x) == 0 or len(x) == 1 and x[0] == "" @@ -346,7 +372,7 @@ def did_change_splitting_var(self): trigger_rendering=self.trigger_rendering + 1, ) - if self.splitting_var == "None": + if splitting_var == "None": notif_msg = "No more splitting. Filters reset to default" if notif_msg is not None: @@ -411,7 +437,7 @@ def tags_filter(cell): groupers = ["year"] if self.splitting_var != "None": - groupers.append(self.splitting_var) + groupers.append(self.splitting_var_from_label(self.splitting_var)) result = filtered_df.groupby(groupers).agg(**aggretations).reset_index() @@ -465,26 +491,28 @@ def updated_echart_plot(self): series = [] legend_data = [] - if self.splitting_var == "affiliation_country": + splitting_var = self.splitting_var_from_label(self.splitting_var) + + if splitting_var == "affiliation_country": splitting_var_filter = self.filter_affiliation_country splitting_var_column = "affiliation_country" - splitting_var_query = lambda cell, selected_item: selected_item in cell + splitting_var_query = lambda cell, selected_item: selected_item in cell # noqa: E731 - elif self.splitting_var == "funder": + elif splitting_var == "funder": splitting_var_filter = self.filter_funder splitting_var_column = "funder" - splitting_var_query = lambda cell, selected_item: selected_item in cell + splitting_var_query = lambda cell, selected_item: selected_item in cell # noqa: E731 - elif self.splitting_var == "data_tags": + elif splitting_var == "data_tags": splitting_var_filter = self.filter_tags splitting_var_column = "data_tags" - splitting_var_query = lambda cell, selected_item: selected_item in cell + splitting_var_query = lambda cell, selected_item: selected_item in cell # noqa: E731 else: print("Defaulting to splitting var 'journal' ") splitting_var_filter = self.filter_journal splitting_var_column = "journal" - splitting_var_query = lambda cell, selected_item: cell == selected_item + splitting_var_query = lambda cell, selected_item: cell == selected_item # noqa: E731 last_year_values = {} for selected_item in sorted(splitting_var_filter): @@ -497,9 +525,10 @@ def updated_echart_plot(self): ] if len(sub_df) > 0: + aggregation = "mean" if "percent" in raw_metric else "sum" sub_df = ( sub_df.groupby("year") - .agg({raw_metric: "mean"}) # todo fix this + .agg({raw_metric: aggregation}) .reset_index() ) @@ -514,7 +543,7 @@ def updated_echart_plot(self): "name": selected_item, "type": "line", "data": sub_df[raw_metric].tolist(), - # Shows a label at the end of the line. + # Shows a label at the end of the plotted line. # Labels end up overlapping in some cases. # To fix this, we would need to change the offset of the label # with values calculated to avoid overlapping. @@ -559,6 +588,7 @@ def updated_echart_plot(self): "tooltip": { "show": True, "trigger": "axis", + "order": "valueDesc", # "formatter": f"""{self.splitting_var} : {{b0}}
# {{a0}} : {{c0}}
# {{a1}} : {{c1}} """, @@ -591,7 +621,9 @@ def updated_echart_plot(self): "fontFamily": "Roboto", "fontSize": "20", }, - "axisLabel": {"formatter": "{value}%"}, + "axisLabel": { + "formatter": "{value}%" if "percent" in raw_metric else "{value}" + }, }, "series": series, }