Skip to content

Commit

Permalink
feat: Restructure packages to streamline the addition of new standards
Browse files: browse the repository at this point in the history
  • Loading branch information
roquelopez committed Oct 21, 2024
1 parent c82c887 commit eca3d65
Show file tree
Hide file tree
Showing 7 changed files with 20,101 additions and 96 deletions.
27 changes: 14 additions & 13 deletions bdikit/api.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,7 +18,6 @@
import numpy as np
import panel as pn
from IPython.display import display, Markdown
from bdikit.utils import get_gdc_data, get_gdc_metadata

from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.best.matcher_factory import SchemaMatchers
Expand All @@ -27,6 +26,7 @@
from bdikit.value_matching.base import BaseValueMatcher, ValueMatch, ValueMatchingResult
from bdikit.value_matching.matcher_factory import ValueMatchers

from bdikit.standards.standard_factory import Standards
from bdikit.mapping_functions import (
ValueMapper,
FunctionValueMapper,
Expand All @@ -36,7 +36,6 @@

pn.extension("tabulator")

GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv")
DEFAULT_VALUE_MATCHING_METHOD = "tfidf"
DEFAULT_SCHEMA_MATCHING_METHOD = "coma"
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -92,10 +91,10 @@ def _load_table_for_standard(name: str) -> pd.DataFrame:
Load the table for the given standard data vocabulary. Currently, only the
GDC standard is supported.
"""
if name == "gdc":
return pd.read_csv(GDC_DATA_PATH)
else:
raise ValueError(f"The {name} standard is not supported")
standard = Standards.get_instance(name)
df = standard.get_dataframe_rep()

return df


def top_matches(
Expand Down Expand Up @@ -439,9 +438,10 @@ def _format_value_matching_input(
f"The source column '{source_column}' is not present in the source dataset."
)

if isinstance(target, str) and target == "gdc":
if isinstance(target, str):
column_names = mapping_df["target"].unique().tolist()
target_domain = get_gdc_data(column_names)
standard = Standards.get_instance(target)
target_domain = standard.get_column_values(column_names)
elif isinstance(target, pd.DataFrame):
target_domain = {
column_name: target[column_name].unique().tolist()
Expand Down Expand Up @@ -518,11 +518,12 @@ def preview_domain(
(if applicable).
"""

if isinstance(dataset, str) and dataset == "gdc":
gdc_metadata = get_gdc_metadata()
value_names = gdc_metadata[column]["value_names"]
value_descriptions = gdc_metadata[column]["value_descriptions"]
column_description = gdc_metadata[column]["description"]
if isinstance(dataset, str):
standard = Standards.get_instance(dataset)
column_metadata = standard.get_column_metadata([column])
value_names = column_metadata[column]["value_names"]
value_descriptions = column_metadata[column]["value_descriptions"]
column_description = column_metadata[column]["description"]
assert len(value_names) == len(value_descriptions)
elif isinstance(dataset, pd.DataFrame):
value_names = dataset[column].unique()
Expand Down
Loading

0 comments on commit eca3d65

Please sign in to comment.