feat: Load the necessary algorithms dynamically

VIDA-NYU · Nov 7, 2024 · c81514c · c81514c
1 parent c82c887
commit c81514c
Show file tree

Hide file tree

Showing 20 changed files with 116 additions and 89 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -12,11 +12,12 @@ Adding New Matching Methods
 
 Contributors can add new methods for schema and value matching by following these steps:
 
-1. Create a Python module inside the `algorithms` folder (e.g., `bdikit/value_matching/algorithms`).
+1. Create a Python module inside the folder of the corresponding task (e.g., `bdikit/value_matching`).
 
 2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).
 
-3. Instantiate an object of your class in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). Ensure your module is properly imported in the `__init__.py` file (e.g.,` bdikit/value_matching/__init__.py`).
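+3. Add a new entry to the matcher enum in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`) that maps the matcher name to the full dotted import path of your class (e.g., `bdikit.value_matching.polyfuzz.TFIDFValueMatcher`).
+The class is imported dynamically from this path when the matcher is requested, so double-check the module and class names: a typo will only surface as an error at that point.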
 
 
 Code of Conduct

diff --git a/bdikit/api.py b/bdikit/api.py
@@ -74,7 +74,7 @@ def match_schema(
     if isinstance(method, str):
         if method_args is None:
             method_args = {}
-        matcher_instance = SchemaMatchers.get_instance(method, **method_args)
+        matcher_instance = SchemaMatchers.get_matcher(method, **method_args)
     elif isinstance(method, BaseSchemaMatcher):
         matcher_instance = method
     else:
@@ -132,7 +132,7 @@ def top_matches(
     if isinstance(method, str):
         if method_args is None:
             method_args = {}
-        topk_matcher = TopkMatchers.get_instance(method, **method_args)
+        topk_matcher = TopkMatchers.get_matcher(method, **method_args)
     elif isinstance(method, BaseTopkSchemaMatcher):
         topk_matcher = method
     else:
@@ -343,7 +343,7 @@ def _match_values(
     target_domain, column_mapping_list = _format_value_matching_input(
         source, target, column_mapping
     )
-    value_matcher = ValueMatchers.get_instance(method, **method_args)
+    value_matcher = ValueMatchers.get_matcher(method, **method_args)
     mapping_results: List[ValueMatchingResult] = []
 
     for mapping in column_mapping_list:

diff --git a/bdikit/schema_matching/best/__init__.py b/bdikit/schema_matching/best/__init__.py
@@ -1,5 +0,0 @@
-from bdikit.schema_matching.best.algorithms.valentine import *
-from bdikit.schema_matching.best.algorithms.gpt import *
-from bdikit.schema_matching.best.algorithms.contrastivelearning import *
-from bdikit.schema_matching.best.algorithms.twophase import *
-from bdikit.schema_matching.best.algorithms.maxvalsim import *

diff --git a/bdikit/schema_matching/best/algorithms/__init__.py b/bdikit/schema_matching/best/algorithms/__init__.py
diff --git a/...ng/best/algorithms/contrastivelearning.py → ...hema_matching/best/contrastivelearning.py b/...ng/best/algorithms/contrastivelearning.py → ...hema_matching/best/contrastivelearning.py
diff --git a/...it/schema_matching/best/algorithms/gpt.py → bdikit/schema_matching/best/gpt.py b/...it/schema_matching/best/algorithms/gpt.py → bdikit/schema_matching/best/gpt.py
diff --git a/bdikit/schema_matching/best/matcher_factory.py b/bdikit/schema_matching/best/matcher_factory.py
@@ -1,45 +1,63 @@
+import importlib
 from enum import Enum
-from typing import Mapping, Any, Type
+from typing import Mapping, Any
 from bdikit.schema_matching.best.base import BaseSchemaMatcher
-from bdikit.schema_matching.best import (
-    SimFloodSchemaMatcher,
-    ComaSchemaMatcher,
-    CupidSchemaMatcher,
-    DistributionBasedSchemaMatcher,
-    JaccardSchemaMatcher,
-    GPTSchemaMatcher,
-    ContrastiveLearningSchemaMatcher,
-    TwoPhaseSchemaMatcher,
-    MaxValSimSchemaMatcher,
-)
 
 
 class SchemaMatchers(Enum):
-    SIMFLOOD = ("similarity_flooding", SimFloodSchemaMatcher)
-    COMA = ("coma", ComaSchemaMatcher)
-    CUPID = ("cupid", CupidSchemaMatcher)
-    DISTRIBUTION_BASED = ("distribution_based", DistributionBasedSchemaMatcher)
-    JACCARD_DISTANCE = ("jaccard_distance", JaccardSchemaMatcher)
-    GPT = ("gpt", GPTSchemaMatcher)
-    CT_LEARNING = ("ct_learning", ContrastiveLearningSchemaMatcher)
-    TWO_PHASE = ("two_phase", TwoPhaseSchemaMatcher)
-    MAX_VAL_SIM = ("max_val_sim", MaxValSimSchemaMatcher)
+    SIMFLOOD = (
+        "similarity_flooding",
+        "bdikit.schema_matching.best.valentine.SimFloodSchemaMatcher",
+    )
+    COMA = (
+        "coma",
+        "bdikit.schema_matching.best.valentine.ComaSchemaMatcher",
+    )
+    CUPID = (
+        "cupid",
+        "bdikit.schema_matching.best.valentine.CupidSchemaMatcher",
+    )
+    DISTRIBUTION_BASED = (
+        "distribution_based",
+        "bdikit.schema_matching.best.valentine.DistributionBasedSchemaMatcher",
+    )
+    JACCARD_DISTANCE = (
+        "jaccard_distance",
+        "bdikit.schema_matching.best.valentine.JaccardSchemaMatcher",
+    )
+    GPT = ("gpt", "bdikit.schema_matching.best.gpt.GPTSchemaMatcher")
+    CT_LEARNING = (
+        "ct_learning",
+        "bdikit.schema_matching.best.contrastivelearning.ContrastiveLearningSchemaMatcher",
+    )
+    TWO_PHASE = (
+        "two_phase",
+        "bdikit.schema_matching.best.twophase.TwoPhaseSchemaMatcher",
+    )
+    MAX_VAL_SIM = (
+        "max_val_sim",
+        "bdikit.schema_matching.best.maxvalsim.MaxValSimSchemaMatcher",
+    )
 
-    def __init__(self, method_name: str, method_class: Type[BaseSchemaMatcher]):
-        self.method_name = method_name
-        self.method_class = method_class
+    def __init__(self, matcher_name: str, matcher_path: str):
+        self.matcher_name = matcher_name
+        self.matcher_path = matcher_path
 
     @staticmethod
-    def get_instance(
-        method_name: str, **method_kwargs: Mapping[str, Any]
+    def get_matcher(
+        matcher_name: str, **matcher_kwargs: Mapping[str, Any]
     ) -> BaseSchemaMatcher:
-        methods = {method.method_name: method.method_class for method in SchemaMatchers}
-
-        try:
-            return methods[method_name](**method_kwargs)
-        except KeyError:
-            names = ", ".join(list(methods.keys()))
+        if matcher_name not in matchers:
+            names = ", ".join(list(matchers.keys()))
             raise ValueError(
-                f"The {method_name} algorithm is not supported. "
+                f"The {matcher_name} algorithm is not supported. "
                 f"Supported algorithms are: {names}"
             )
+        # Load the class dynamically
+        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
+        module = importlib.import_module(module_path)
+
+        return getattr(module, class_name)(**matcher_kwargs)
+
+
+matchers = {method.matcher_name: method.matcher_path for method in SchemaMatchers}
diff --git a/...ema_matching/best/algorithms/maxvalsim.py → bdikit/schema_matching/best/maxvalsim.py b/...ema_matching/best/algorithms/maxvalsim.py → bdikit/schema_matching/best/maxvalsim.py
diff --git a/...hema_matching/best/algorithms/twophase.py → bdikit/schema_matching/best/twophase.py b/...hema_matching/best/algorithms/twophase.py → bdikit/schema_matching/best/twophase.py
diff --git a/...ema_matching/best/algorithms/valentine.py → bdikit/schema_matching/best/valentine.py b/...ema_matching/best/algorithms/valentine.py → bdikit/schema_matching/best/valentine.py
diff --git a/bdikit/schema_matching/topk/__init__.py b/bdikit/schema_matching/topk/__init__.py
@@ -1 +1 @@
-from bdikit.schema_matching.topk.algorithms.contrastivelearning import *
+
diff --git a/...ng/topk/algorithms/contrastivelearning.py → ...hema_matching/topk/contrastivelearning.py b/...ng/topk/algorithms/contrastivelearning.py → ...hema_matching/topk/contrastivelearning.py
diff --git a/bdikit/schema_matching/topk/matcher_factory.py b/bdikit/schema_matching/topk/matcher_factory.py
@@ -1,26 +1,34 @@
+import importlib
 from enum import Enum
-from typing import Mapping, Any, Type
+from typing import Mapping, Any
 from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
-from bdikit.schema_matching.topk import CLTopkSchemaMatcher
 
 
 class TopkMatchers(Enum):
-    CT_LEARNING = ("ct_learning", CLTopkSchemaMatcher)
+    CT_LEARNING = (
+        "ct_learning",
+        "bdikit.schema_matching.topk.contrastivelearning.CLTopkSchemaMatcher",
+    )
 
-    def __init__(self, method_name: str, method_class: Type[BaseTopkSchemaMatcher]):
-        self.method_name = method_name
-        self.method_class = method_class
+    def __init__(self, matcher_name: str, matcher_path: str):
+        self.matcher_name = matcher_name
+        self.matcher_path = matcher_path
 
     @staticmethod
-    def get_instance(
-        method_name: str, **method_kwargs: Mapping[str, Any]
+    def get_matcher(
+        matcher_name: str, **matcher_kwargs: Mapping[str, Any]
     ) -> BaseTopkSchemaMatcher:
-        methods = {method.method_name: method.method_class for method in TopkMatchers}
-        try:
-            return methods[method_name](**method_kwargs)
-        except KeyError:
-            names = ", ".join(list(methods.keys()))
+        if matcher_name not in matchers:
+            names = ", ".join(list(matchers.keys()))
             raise ValueError(
-                f"The {method_name} algorithm is not supported. "
+                f"The {matcher_name} algorithm is not supported. "
                 f"Supported algorithms are: {names}"
             )
+        # Load the class dynamically
+        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
+        module = importlib.import_module(module_path)
+
+        return getattr(module, class_name)(**matcher_kwargs)
+
+
+matchers = {method.matcher_name: method.matcher_path for method in TopkMatchers}
diff --git a/bdikit/value_matching/__init__.py b/bdikit/value_matching/__init__.py
@@ -1,2 +0,0 @@
-from bdikit.value_matching.algorithms.polyfuzz import *
-from bdikit.value_matching.algorithms.gpt import *

diff --git a/bdikit/value_matching/algorithms/__init__.py b/bdikit/value_matching/algorithms/__init__.py
diff --git a/bdikit/value_matching/algorithms/gpt.py → bdikit/value_matching/gpt.py b/bdikit/value_matching/algorithms/gpt.py → bdikit/value_matching/gpt.py
diff --git a/bdikit/value_matching/matcher_factory.py b/bdikit/value_matching/matcher_factory.py
@@ -1,36 +1,44 @@
+import importlib
 from enum import Enum
-from typing import Mapping, Any, Type
+from typing import Mapping, Any
 from bdikit.value_matching.base import BaseValueMatcher
-from bdikit.value_matching import (
-    GPTValueMatcher,
-    TFIDFValueMatcher,
-    EditDistanceValueMatcher,
-    EmbeddingValueMatcher,
-    FastTextValueMatcher,
-)
 
 
 class ValueMatchers(Enum):
-    TFIDF = ("tfidf", TFIDFValueMatcher)
-    EDIT = ("edit_distance", EditDistanceValueMatcher)
-    EMBEDDINGS = ("embedding", EmbeddingValueMatcher)
-    FASTTEXT = ("fasttext", FastTextValueMatcher)
-    GPT = ("gpt", GPTValueMatcher)
+    TFIDF = ("tfidf", "bdikit.value_matching.polyfuzz.TFIDFValueMatcher")
+    EDIT = (
+        "edit_distance",
+        "bdikit.value_matching.polyfuzz.EditDistanceValueMatcher",
+    )
+    EMBEDDINGS = (
+        "embedding",
+        "bdikit.value_matching.polyfuzz.EmbeddingValueMatcher",
+    )
+    FASTTEXT = (
+        "fasttext",
+        "bdikit.value_matching.polyfuzz.FastTextValueMatcher",
+    )
+    GPT = ("gpt", "bdikit.value_matching.gpt.GPTValueMatcher")
 
-    def __init__(self, method_name: str, method_class: Type[BaseValueMatcher]):
-        self.method_name = method_name
-        self.method_class = method_class
+    def __init__(self, matcher_name: str, matcher_path: str):
+        self.matcher_name = matcher_name
+        self.matcher_path = matcher_path
 
     @staticmethod
-    def get_instance(
-        method_name: str, **method_kwargs: Mapping[str, Any]
+    def get_matcher(
+        matcher_name: str, **matcher_kwargs: Mapping[str, Any]
     ) -> BaseValueMatcher:
-        methods = {method.method_name: method.method_class for method in ValueMatchers}
-        try:
-            return methods[method_name](**method_kwargs)
-        except KeyError:
-            names = ", ".join(list(methods.keys()))
+        if matcher_name not in matchers:
+            names = ", ".join(list(matchers.keys()))
             raise ValueError(
-                f"The {method_name} algorithm is not supported. "
+                f"The {matcher_name} algorithm is not supported. "
                 f"Supported algorithms are: {names}"
             )
+        # Load the class dynamically
+        module_path, class_name = matchers[matcher_name].rsplit(".", 1)
+        module = importlib.import_module(module_path)
+
+        return getattr(module, class_name)(**matcher_kwargs)
+
+
+matchers = {method.matcher_name: method.matcher_path for method in ValueMatchers}
diff --git a/bdikit/value_matching/algorithms/polyfuzz.py → bdikit/value_matching/polyfuzz.py b/bdikit/value_matching/algorithms/polyfuzz.py → bdikit/value_matching/polyfuzz.py
diff --git a/tests/test_schema_matching.py b/tests/test_schema_matching.py
@@ -1,13 +1,13 @@
 import pandas as pd
-from bdikit.schema_matching.best import (
+from bdikit.schema_matching.best.valentine import (
     SimFloodSchemaMatcher,
     JaccardSchemaMatcher,
     DistributionBasedSchemaMatcher,
     ComaSchemaMatcher,
     CupidSchemaMatcher,
-    TwoPhaseSchemaMatcher,
-    ContrastiveLearningSchemaMatcher,
 )
+from bdikit.schema_matching.best.twophase import TwoPhaseSchemaMatcher
+from bdikit.schema_matching.best.contrastivelearning import ContrastiveLearningSchemaMatcher
 
 
 def test_basic_column_mapping_algorithms():

diff --git a/tests/test_value_matching.py b/tests/test_value_matching.py
@@ -1,6 +1,6 @@
 import unittest
 import pandas as pd
-from bdikit.value_matching import (
+from bdikit.value_matching.polyfuzz import (
     TFIDFValueMatcher,
     EditDistanceValueMatcher,
 )