diff --git a/web_api/dashboard/app.py b/web_api/dashboard/app.py
new file mode 100644
index 00000000..1e2e7354
--- /dev/null
+++ b/web_api/dashboard/app.py
@@ -0,0 +1,135 @@
+import os
+
+import pandas as pd
+import panel as pn
+from main_dashboard import MainDashboard
+from odmantic import SyncEngine
+from pymongo import MongoClient
+from ui import get_template
+
+from osm import schemas
+
+
+def flatten_dict(d):
+    """
+    Recursively flattens a nested dictionary without prepending parent keys.
+
+    Since parent keys are dropped, equally named keys from different nesting
+    levels collide and the value met last wins.
+
+    :param d: Dictionary to flatten.
+    :return: Flattened dictionary.
+    """
+    flat = {}
+    for key, value in d.items():
+        if isinstance(value, dict):
+            # If the value is a dictionary, flatten it without the parent key
+            flat.update(flatten_dict(value))
+        else:
+            flat[key] = value
+    return flat
+
+
+def load_data():
+    """
+    Loads the records displayed by the dashboard.
+
+    Reads the feather file at ``LOCAL_DATA_PATH`` when that environment
+    variable is set; otherwise aggregates the invocations stored in the
+    ``osm`` database of the MongoDB server at ``MONGODB_URI``.
+
+    :return: DataFrame built from one flattened dictionary per record.
+    """
+    if "LOCAL_DATA_PATH" in os.environ:
+        return pd.read_feather(os.environ["LOCAL_DATA_PATH"])
+
+    pipeline = [
+        {
+            "$match": {
+                "osm_version": {"$eq": "0.0.1"},
+                # "work.pmid": {"$regex":r"^2"},
+                "metrics.year": {"$gt": 2000},
+                # "metrics.is_data_pred": {"$eq": True},
+            },
+        },
+        {
+            "$project": {
+                # "osm_version": True,
+                # "user_comment": True,
+                # "client.compute_context_id": True,
+                "work.user_defined_id": True,
+                "metrics.year": True,
+                "metrics.is_code_pred": True,
+                "metrics.is_data_pred": True,
+                "metrics.affiliation_country": True,
+                "metrics.score": True,
+                "metrics.eigenfactor_score": True,
+                "metrics.fund_pmc_anysource": True,
+                "metrics.fund_pmc_institute": True,
+                "metrics.fund_pmc_source": True,
+                "metrics.journal": True,
+            },
+        },
+    ]
+    # The connection is closed when the block exits, so the cursor has to be
+    # consumed into the DataFrame before leaving it.
+    with MongoClient(os.environ["MONGODB_URI"]) as client:
+        engine = SyncEngine(client=client, database="osm")
+        matches = engine.get_collection(schemas.Invocation).aggregate(pipeline)
+        return pd.DataFrame(flatten_dict(match) for match in matches)
+
+
+def dashboard_page():
+    """
+    Builds the page served at ``/``.
+
+    :return: Template holding the dashboard and its sidebar, both created
+        from the data stored in ``pn.state.cache["data"]``.
+    """
+    template = get_template()
+
+    dashboard = MainDashboard(pn.state.cache["data"])
+
+    template.main.append(dashboard.get_dashboard)
+    template.sidebar.append(dashboard.get_sidebar)
+
+    return template
+
+
+def on_load():
+    """
+    Add resource intensive things that you only want to run once.
+    """
+    pn.config.browser_info = True
+    pn.config.notifications = True
+    raw_data = load_data()
+    # Boolean-frame indexing blanks every cell equal to 999999 (it becomes
+    # a missing value); rows are kept.
+    raw_data = raw_data[raw_data != 999999]
+
+    # Hardcoded for now, will be added to the raw data later
+    raw_data["metrics"] = "RTransparent"
+
+    pn.state.cache["data"] = raw_data
+
+
+if __name__ == "__main__":
+    # Runs all the things necessary before the server actually starts.
+    pn.state.onload(on_load)
+    print("starting dashboard!")
+    pn.serve(
+        {"/": dashboard_page},
+        address="0.0.0.0",
+        port=8501,
+        start=True,
+        location=True,
+        show=False,
+        keep_alive=30 * 1000,  # 30s
+        autoreload=True,
+        admin=True,
+        profiler="pyinstrument",
+        allow_websocket_origin=[
+            "localhost:8501",
+            "osm.pythonaisolutions.com",
+        ],
+    )
diff --git a/web_api/dashboard/main_dashboard.py b/web_api/dashboard/main_dashboard.py
new file mode 100644
index 00000000..a5044088
--- /dev/null
+++ b/web_api/dashboard/main_dashboard.py
@@ -0,0 +1,213 @@
+import holoviews as hv
+import pandas as pd
+import panel as pn
+import param
+
+pn.extension()
+
+
+pd.options.display.max_columns = None
+
+# filters = {
+#     "journal" : "category",
+#     "metrics" : "select",
+# }
+
+
+groups = {"year": "int"}
+
+
+dims_aggregations = {
+    "is_data_pred": ["percent", "count_true", "count"],
+    "is_code_pred": ["percent", "count_true"],
+    "score": ["mean"],
+    "eigenfactor_score": ["mean"],
+}
+
+
+aggregation_formulas = {
+    "percent": lambda x: x.mean() * 100,
+    "count_true": lambda x: (x == True).sum(),  # noqa
+    "count": "count",
+    "mean": "mean",
+}
+
+
+class MainDashboard(param.Parameterized):
+    """
+    Main dashboard for the application.
+
+    Holds the raw data plus the sidebar parameters and renders the sidebar
+    and the chart from them.
+    """
+
+    select_metrics = param.Selector(
+        default="RTransparent", objects=["RTransparent"], label=""
+    )
+
+    grouping_var = param.Selector(
+        default="year", objects=["year", "fund_pmc_institute"], label=""
+    )
+
+    filter_journal = param.Selector(
+        default="All journals (including empty)",
+        objects=[
+            "All journals (including empty)",
+            "All journals (excluding empty values)",
+            "Only selected journals",
+        ],
+        label="Journal",
+    )
+    filter_selected_journals = param.ListSelector(default=[], objects=[], label="")
+
+    def __init__(self, raw_data, **params):
+        """
+        :param raw_data: DataFrame to chart; its ``journal`` column feeds the
+            journal filter.
+        :param params: Initial values for the parameters declared above.
+        """
+        super().__init__(**params)
+
+        self.raw_data = raw_data
+
+        self.param.filter_selected_journals.objects = self.raw_data.journal.unique()
+        # As default, takes the journals with the biggest number of occurrences
+        self.filter_selected_journals = list(
+            self.raw_data.journal.value_counts().iloc[:10].index
+        )
+
+    def filtered_grouped_data(self):
+        """
+        Applies the journal filter and aggregates the rows per grouping value.
+
+        :return: DataFrame with the grouping column followed by one
+            ``<aggregation>_<field>`` column for every aggregation listed in
+            ``dims_aggregations``.
+        """
+        journals = self.raw_data["journal"]
+        # Boolean masks rather than string-built ``query`` expressions, so any
+        # journal value (quotes, missing values, ...) can be selected safely.
+        if self.filter_journal == "All journals (excluding empty values)":
+            filtered_df = self.raw_data[journals.notna()]
+        elif self.filter_journal == "Only selected journals":
+            filtered_df = self.raw_data[journals.isin(self.filter_selected_journals)]
+        else:
+            filtered_df = self.raw_data
+
+        aggregations = {
+            f"{agg}_{field}": (field, aggregation_formulas[agg])
+            for field, aggs in dims_aggregations.items()
+            for agg in aggs
+        }
+
+        return (
+            filtered_df.groupby(self.grouping_var).agg(**aggregations).reset_index()
+        )
+
+    @pn.depends("select_metrics", "filter_journal")
+    def get_sidebar(self):
+        """
+        Builds the sidebar widgets.
+
+        The journal multi-choice is only added while ``filter_journal`` is
+        ``"Only selected journals"``.
+
+        :return: Column containing the sidebar widgets.
+        """
+        items = [
+            pn.pane.Markdown("## Filters"),
+            pn.pane.Markdown("### Metrics extraction tool"),
+            pn.widgets.Select.from_param(self.param.select_metrics),
+            pn.layout.Divider(),
+            pn.pane.Markdown("### Grouping"),
+            pn.widgets.Select.from_param(self.param.grouping_var),
+            pn.layout.Divider(),
+            pn.pane.Markdown("### Filters"),
+            pn.widgets.Select.from_param(self.param.filter_journal),
+        ]
+
+        if self.filter_journal == "Only selected journals":
+            items.append(
+                pn.widgets.MultiChoice.from_param(
+                    self.param.filter_selected_journals, max_items=10
+                )
+            )
+
+        return pn.Column(*items)
+
+    @pn.depends(
+        "select_metrics", "filter_journal", "filter_selected_journals", "grouping_var"
+    )
+    def get_dashboard(self):
+        """
+        Builds the chart of data and code percentages per grouping value.
+
+        :return: Column containing the title and the overlaid plot.
+        """
+        df = self.filtered_grouped_data()
+
+        # Create charts
+        fig_data_curve = hv.Curve(
+            df,
+            kdims=[self.grouping_var],
+            vdims=[
+                "percent_is_data_pred",
+            ],
+        ).opts(color="red")
+
+        fig_code_curve = hv.Curve(
+            df,
+            kdims=[self.grouping_var],
+            vdims=[
+                "percent_is_code_pred",
+            ],
+        ).opts(color="lightblue")
+
+        fig_data_points = hv.Points(
+            df,
+            kdims=[self.grouping_var, "percent_is_data_pred"],
+        ).opts(
+            tools=["hover"],
+            color="red",
+            size=5,
+            hover_tooltips=[
+                (self.grouping_var, f"@{self.grouping_var}"),
+                ("% is_data_pred", "@percent_is_data_pred"),
+                ("Total is_data_pred", "@count_true_is_data_pred"),
+                ("nbr_publications", "@count_is_data_pred"),
+            ],
+        )
+
+        fig_code_points = hv.Points(
+            df,
+            kdims=[self.grouping_var, "percent_is_code_pred"],
+        ).opts(
+            tools=["hover"],
+            color="lightblue",
+            size=5,
+            hover_tooltips=[
+                (self.grouping_var, f"@{self.grouping_var}"),
+                ("% is_code_pred", "@percent_is_code_pred"),
+                ("Total is_code_pred", "@count_true_is_code_pred"),
+                # "count" is only aggregated for is_data_pred, so it is reused here
+                ("nbr_publications", "@count_is_data_pred"),
+            ],
+        )
+
+        plot = (
+            fig_code_curve * fig_data_curve * fig_data_points * fig_code_points
+        ).opts(
+            title="",
+            xlabel=self.grouping_var,
+            ylabel="Percentage",
+            width=800,
+            height=400,
+            legend_position="top_left",
+        )
+
+        # Layout the dashboard
+        dashboard = pn.Column(
+            "# Data and code transparency",
+            pn.Column(plot, sizing_mode="stretch_width"),
+        )
+
+        return dashboard
diff --git a/web_api/dashboard/ui.py b/web_api/dashboard/ui.py
new file mode 100644
index 00000000..6f27805f
--- /dev/null
+++ b/web_api/dashboard/ui.py
@@ -0,0 +1,38 @@
+import panel as pn
+
+
+def connection_monitor():
+    """
+    Returns the HTML pane that is appended to the template header.
+    """
+    # NOTE(review): the markup below is blank in this patch although the hunk
+    # originally declared 71 lines - presumably the HTML was lost on the way;
+    # confirm and restore it before applying.
+    monitor = pn.pane.HTML(
+        """
+
+
+        """
+    )
+
+    return monitor
+
+
+def get_template():
+    """
+    Returns the Panel template of the application, with the connection
+    monitor added to its header.
+    """
+
+    template = pn.template.FastListTemplate(
+        site="NIH",
+        title="OpenSciMetrics",
+        favicon="https://www.nih.gov/favicon.ico",
+        sidebar=[],
+    )
+
+    template.header.append(
+        connection_monitor(),
+    )
+
+    return template