diff --git a/.gitattributes b/.gitattributes
index 6e6a3da8..6ccbee3d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -6,4 +6,3 @@
# Enable binary delta compression for PNG files
*.png -delta
-
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 605efde9..9248b113 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,9 +1,4 @@
repos:
- - repo: https://github.com/psf/black
- rev: 24.4.2
- hooks:
- - id: black
-
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.5.0
diff --git a/pyproject.toml b/pyproject.toml
index 1808d06b..f13b6a4d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ keywords = [
dynamic = ["version"]
dependencies = [
"dill",
+ "colorcet",
"pandas",
"pyarrow",
"pydantic",
diff --git a/web/dashboard/app.py b/web/dashboard/app.py
index cd8af205..3a828f33 100644
--- a/web/dashboard/app.py
+++ b/web/dashboard/app.py
@@ -41,9 +41,14 @@ def load_data():
# necessary conversion to tuples, which is hashable type
# needed for grouping
- for col in ["affiliation_country", "funder", "data_tags"]:
+ for col in ["funder", "data_tags"]:
raw_data[col] = raw_data[col].apply(lambda x: tuple(x))
+ # convert to tuple, remove duplicates, and strip leading and trailing spaces from country names
+ raw_data["affiliation_country"] = raw_data["affiliation_country"].apply(
+ lambda x: tuple(set([v.strip() for v in x]))
+ )
+
return raw_data
diff --git a/web/dashboard/main_dashboard.py b/web/dashboard/main_dashboard.py
index 749eb6ef..c98beab8 100644
--- a/web/dashboard/main_dashboard.py
+++ b/web/dashboard/main_dashboard.py
@@ -32,12 +32,22 @@
"funder",
"data_tags",
],
+ "labels": {
+ "None": "None",
+ "journal": "Journal",
+ "affiliation_country": "Country",
+ "funder": "Funder",
+ "data_tags": "Tags",
+ },
}
}
dims_aggregations = {
- "is_open_data": ["percent", "count_true"],
- "is_open_code": ["percent", "count_true"],
+ "is_open_data": ["percent", "count_true", "count"],
+ "is_open_code": [
+ "percent",
+ "count_true",
+ ],
# "score": ["mean"],
# "eigenfactor_score": ["mean"],
}
@@ -47,6 +57,7 @@
"percent_is_open_code": "Code Sharing (%)",
"count_true_is_open_data": "Data Sharing",
"count_true_is_open_code": "Code Sharing",
+ "count_is_open_data": "Total number of publications",
"mean_score": "Mean Score",
"mean_eigenfactor_score": "Mean Eigenfactor Score",
}
@@ -156,6 +167,16 @@ def __init__(self, datasets, **params):
# DEBUG
self.echarts_update_button.on_click(self.did_click_update_echart_plot)
+ def splitting_var_label(self, splitting_var):
+     """Return the display label registered for ``splitting_var``.
+
+     The label is read from the ``"labels"`` mapping of the currently
+     selected extraction tool in ``extraction_tools_params``.
+     """
+     tool_labels = extraction_tools_params[self.extraction_tool]["labels"]
+     return tool_labels[splitting_var]
+
+ def splitting_var_from_label(self, label):
+     """Return the splitting-variable key whose display label equals ``label``.
+
+     Reverse lookup in the ``"labels"`` mapping of the currently selected
+     extraction tool. Raises ``IndexError`` when no key carries that label.
+     """
+     tool_labels = extraction_tools_params[self.extraction_tool]["labels"]
+     matching_keys = [key for key, text in tool_labels.items() if text == label]
+     # Only the first match is used; presumably labels are unique per tool -- TODO confirm
+     return matching_keys[0]
+
@pn.depends("extraction_tool", watch=True)
def did_change_extraction_tool(self):
print("DID_CHANGE_EXTRACTION_TOOL")
@@ -180,7 +201,9 @@ def did_change_extraction_tool(self):
new_extraction_tools_splitting_vars = extraction_tools_params[
self.extraction_tool
]["splitting_vars"]
- self.param.splitting_var.objects = new_extraction_tools_splitting_vars
+ self.param.splitting_var.objects = [
+ self.splitting_var_label(v) for v in new_extraction_tools_splitting_vars
+ ]
# Update the filters
## filter_pubdate
@@ -201,7 +224,7 @@ def did_change_extraction_tool(self):
## affiliation country
countries_with_count = self.get_col_values_with_count(
- "affiliation_country", lambda x: x is None
+ "affiliation_country", lambda x: len(x) == 0 or len(x) == 1 and x[0] == ""
)
def country_sorter(c):
@@ -225,7 +248,7 @@ def funder_sorter(c):
## Tags
tags_with_count = self.get_col_values_with_count(
- "data_tags", lambda x: x is None
+ "data_tags", lambda x: len(x) == 0 or len(x) == 1 and x[0] == ""
)
def tags_sorter(c):
@@ -258,8 +281,10 @@ def did_change_splitting_var(self):
# already set the echarts pane as loading for a better UX
self.echarts_pane.loading = True
+ splitting_var = self.splitting_var_from_label(self.splitting_var)
+
notif_msg = None
- if self.splitting_var == "journal":
+ if splitting_var == "journal":
# We want to show all journals, but pre-select only the top 10
selected_journals = list(
self.raw_data.query("journal != 'None'")
@@ -271,10 +296,11 @@ def did_change_splitting_var(self):
else:
selected_journals = self.param.filter_journal.objects
- if self.splitting_var == "affiliation_country":
+ if splitting_var == "affiliation_country":
# We want to show all countries, but pre-select only the top 10
countries_with_count = self.get_col_values_with_count(
- "affiliation_country", lambda x: x is None
+ "affiliation_country",
+ lambda x: len(x) == 0 or len(x) == 1 and x[0] == "",
)
# pre-filter the countries because there are a lot
@@ -302,7 +328,7 @@ def did_change_splitting_var(self):
else:
selected_countries = self.param.filter_affiliation_country.objects
- if self.splitting_var == "funder":
+ if splitting_var == "funder":
# We want to show all funders, but pre-select only the top 10
funders_with_count = self.get_col_values_with_count(
"funder", lambda x: len(x) == 0 or len(x) == 1 and x[0] == ""
@@ -346,7 +372,7 @@ def did_change_splitting_var(self):
trigger_rendering=self.trigger_rendering + 1,
)
- if self.splitting_var == "None":
+ if splitting_var == "None":
notif_msg = "No more splitting. Filters reset to default"
if notif_msg is not None:
@@ -411,7 +437,7 @@ def tags_filter(cell):
groupers = ["year"]
if self.splitting_var != "None":
- groupers.append(self.splitting_var)
+ groupers.append(self.splitting_var_from_label(self.splitting_var))
result = filtered_df.groupby(groupers).agg(**aggretations).reset_index()
@@ -465,26 +491,28 @@ def updated_echart_plot(self):
series = []
legend_data = []
- if self.splitting_var == "affiliation_country":
+ splitting_var = self.splitting_var_from_label(self.splitting_var)
+
+ if splitting_var == "affiliation_country":
splitting_var_filter = self.filter_affiliation_country
splitting_var_column = "affiliation_country"
- splitting_var_query = lambda cell, selected_item: selected_item in cell
+ splitting_var_query = lambda cell, selected_item: selected_item in cell # noqa: E731
- elif self.splitting_var == "funder":
+ elif splitting_var == "funder":
splitting_var_filter = self.filter_funder
splitting_var_column = "funder"
- splitting_var_query = lambda cell, selected_item: selected_item in cell
+ splitting_var_query = lambda cell, selected_item: selected_item in cell # noqa: E731
- elif self.splitting_var == "data_tags":
+ elif splitting_var == "data_tags":
splitting_var_filter = self.filter_tags
splitting_var_column = "data_tags"
- splitting_var_query = lambda cell, selected_item: selected_item in cell
+ splitting_var_query = lambda cell, selected_item: selected_item in cell # noqa: E731
else:
print("Defaulting to splitting var 'journal' ")
splitting_var_filter = self.filter_journal
splitting_var_column = "journal"
- splitting_var_query = lambda cell, selected_item: cell == selected_item
+ splitting_var_query = lambda cell, selected_item: cell == selected_item # noqa: E731
last_year_values = {}
for selected_item in sorted(splitting_var_filter):
@@ -497,9 +525,10 @@ def updated_echart_plot(self):
]
if len(sub_df) > 0:
+ aggregation = "mean" if "percent" in raw_metric else "sum"
sub_df = (
sub_df.groupby("year")
- .agg({raw_metric: "mean"}) # todo fix this
+ .agg({raw_metric: aggregation})
.reset_index()
)
@@ -514,7 +543,7 @@ def updated_echart_plot(self):
"name": selected_item,
"type": "line",
"data": sub_df[raw_metric].tolist(),
- # Shows a label at the end of the line.
+ # Shows a label at the end of the plotted line.
# Labels end up overlapping in some cases.
# To fix this, we would need to change the offset of the label
# with values calculated to avoid overlapping.
@@ -559,6 +588,7 @@ def updated_echart_plot(self):
"tooltip": {
"show": True,
"trigger": "axis",
+ "order": "valueDesc",
# "formatter": f"""{self.splitting_var} : {{b0}}
# {{a0}} : {{c0}}
# {{a1}} : {{c1}} """,
@@ -591,7 +621,9 @@ def updated_echart_plot(self):
"fontFamily": "Roboto",
"fontSize": "20",
},
- "axisLabel": {"formatter": "{value}%"},
+ "axisLabel": {
+ "formatter": "{value}%" if "percent" in raw_metric else "{value}"
+ },
},
"series": series,
}