Skip to content

Commit

Permalink
feat: Load the necessary algorithms dynamically
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez committed Nov 7, 2024
1 parent c82c887 commit c81514c
Show file tree
Hide file tree
Showing 20 changed files with 116 additions and 89 deletions.
5 changes: 3 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ Adding New Matching Methods

Contributors can add new methods for schema and value matching by following these steps:

1. Create a Python module inside the `algorithms` folder (e.g., `bdikit/value_matching/algorithms`).
1. Create a Python module inside the corresponding task folder (e.g., `bdikit/value_matching`).

2. Define a class in the module that implements either `BaseValueMatcher` (for value matching) or `BaseSchemaMatcher` (for schema matching).

3. Instantiate an object of your class in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). Ensure your module is properly imported in the `__init__.py` file (e.g., `bdikit/value_matching/__init__.py`).
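3. Add a new entry for your matcher in `matcher_factory.py` (e.g., `bdikit/value_matching/matcher_factory.py`). The entry must give the full dotted import path of your
class (e.g., `bdikit.value_matching.polyfuzz.TFIDFValueMatcher`), because the class is loaded dynamically from that path when the matcher is requested.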


Code of Conduct
Expand Down
6 changes: 3 additions & 3 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def match_schema(
if isinstance(method, str):
if method_args is None:
method_args = {}
matcher_instance = SchemaMatchers.get_instance(method, **method_args)
matcher_instance = SchemaMatchers.get_matcher(method, **method_args)
elif isinstance(method, BaseSchemaMatcher):
matcher_instance = method
else:
Expand Down Expand Up @@ -132,7 +132,7 @@ def top_matches(
if isinstance(method, str):
if method_args is None:
method_args = {}
topk_matcher = TopkMatchers.get_instance(method, **method_args)
topk_matcher = TopkMatchers.get_matcher(method, **method_args)
elif isinstance(method, BaseTopkSchemaMatcher):
topk_matcher = method
else:
Expand Down Expand Up @@ -343,7 +343,7 @@ def _match_values(
target_domain, column_mapping_list = _format_value_matching_input(
source, target, column_mapping
)
value_matcher = ValueMatchers.get_instance(method, **method_args)
value_matcher = ValueMatchers.get_matcher(method, **method_args)
mapping_results: List[ValueMatchingResult] = []

for mapping in column_mapping_list:
Expand Down
5 changes: 0 additions & 5 deletions bdikit/schema_matching/best/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +0,0 @@
from bdikit.schema_matching.best.algorithms.valentine import *
from bdikit.schema_matching.best.algorithms.gpt import *
from bdikit.schema_matching.best.algorithms.contrastivelearning import *
from bdikit.schema_matching.best.algorithms.twophase import *
from bdikit.schema_matching.best.algorithms.maxvalsim import *
1 change: 0 additions & 1 deletion bdikit/schema_matching/best/algorithms/__init__.py

This file was deleted.

File renamed without changes.
84 changes: 51 additions & 33 deletions bdikit/schema_matching/best/matcher_factory.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,63 @@
import importlib
from enum import Enum
from typing import Mapping, Any, Type
from typing import Mapping, Any
from bdikit.schema_matching.best.base import BaseSchemaMatcher
from bdikit.schema_matching.best import (
SimFloodSchemaMatcher,
ComaSchemaMatcher,
CupidSchemaMatcher,
DistributionBasedSchemaMatcher,
JaccardSchemaMatcher,
GPTSchemaMatcher,
ContrastiveLearningSchemaMatcher,
TwoPhaseSchemaMatcher,
MaxValSimSchemaMatcher,
)


class SchemaMatchers(Enum):
SIMFLOOD = ("similarity_flooding", SimFloodSchemaMatcher)
COMA = ("coma", ComaSchemaMatcher)
CUPID = ("cupid", CupidSchemaMatcher)
DISTRIBUTION_BASED = ("distribution_based", DistributionBasedSchemaMatcher)
JACCARD_DISTANCE = ("jaccard_distance", JaccardSchemaMatcher)
GPT = ("gpt", GPTSchemaMatcher)
CT_LEARNING = ("ct_learning", ContrastiveLearningSchemaMatcher)
TWO_PHASE = ("two_phase", TwoPhaseSchemaMatcher)
MAX_VAL_SIM = ("max_val_sim", MaxValSimSchemaMatcher)
SIMFLOOD = (
"similarity_flooding",
"bdikit.schema_matching.best.valentine.SimFloodSchemaMatcher",
)
COMA = (
"coma",
"bdikit.schema_matching.best.valentine.ComaSchemaMatcher",
)
CUPID = (
"cupid",
"bdikit.schema_matching.best.valentine.CupidSchemaMatcher",
)
DISTRIBUTION_BASED = (
"distribution_based",
"bdikit.schema_matching.best.valentine.DistributionBasedSchemaMatcher",
)
JACCARD_DISTANCE = (
"jaccard_distance",
"bdikit.schema_matching.best.valentine.JaccardSchemaMatcher",
)
GPT = ("gpt", "bdikit.schema_matching.best.gpt.GPTSchemaMatcher")
CT_LEARNING = (
"ct_learning",
"bdikit.schema_matching.best.contrastivelearning.ContrastiveLearningSchemaMatcher",
)
TWO_PHASE = (
"two_phase",
"bdikit.schema_matching.best.twophase.TwoPhaseSchemaMatcher",
)
MAX_VAL_SIM = (
"max_val_sim",
"bdikit.schema_matching.best.maxvalsim.MaxValSimSchemaMatcher",
)

def __init__(self, method_name: str, method_class: Type[BaseSchemaMatcher]):
self.method_name = method_name
self.method_class = method_class
def __init__(self, matcher_name: str, matcher_path: str):
self.matcher_name = matcher_name
self.matcher_path = matcher_path

@staticmethod
def get_instance(
method_name: str, **method_kwargs: Mapping[str, Any]
def get_matcher(
matcher_name: str, **matcher_kwargs: Mapping[str, Any]
) -> BaseSchemaMatcher:
methods = {method.method_name: method.method_class for method in SchemaMatchers}

try:
return methods[method_name](**method_kwargs)
except KeyError:
names = ", ".join(list(methods.keys()))
if matcher_name not in matchers:
names = ", ".join(list(matchers.keys()))
raise ValueError(
f"The {method_name} algorithm is not supported. "
f"The {matcher_name} algorithm is not supported. "
f"Supported algorithms are: {names}"
)
# Load the class dynamically
module_path, class_name = matchers[matcher_name].rsplit(".", 1)
module = importlib.import_module(module_path)

return getattr(module, class_name)(**matcher_kwargs)


matchers = {method.matcher_name: method.matcher_path for method in SchemaMatchers}
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion bdikit/schema_matching/topk/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from bdikit.schema_matching.topk.algorithms.contrastivelearning import *

36 changes: 22 additions & 14 deletions bdikit/schema_matching/topk/matcher_factory.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,34 @@
import importlib
from enum import Enum
from typing import Mapping, Any, Type
from typing import Mapping, Any
from bdikit.schema_matching.topk.base import BaseTopkSchemaMatcher
from bdikit.schema_matching.topk import CLTopkSchemaMatcher


class TopkMatchers(Enum):
CT_LEARNING = ("ct_learning", CLTopkSchemaMatcher)
CT_LEARNING = (
"ct_learning",
"bdikit.schema_matching.topk.contrastivelearning.CLTopkSchemaMatcher",
)

def __init__(self, method_name: str, method_class: Type[BaseTopkSchemaMatcher]):
self.method_name = method_name
self.method_class = method_class
def __init__(self, matcher_name: str, matcher_path: str):
self.matcher_name = matcher_name
self.matcher_path = matcher_path

@staticmethod
def get_instance(
method_name: str, **method_kwargs: Mapping[str, Any]
def get_matcher(
matcher_name: str, **matcher_kwargs: Mapping[str, Any]
) -> BaseTopkSchemaMatcher:
methods = {method.method_name: method.method_class for method in TopkMatchers}
try:
return methods[method_name](**method_kwargs)
except KeyError:
names = ", ".join(list(methods.keys()))
if matcher_name not in matchers:
names = ", ".join(list(matchers.keys()))
raise ValueError(
f"The {method_name} algorithm is not supported. "
f"The {matcher_name} algorithm is not supported. "
f"Supported algorithms are: {names}"
)
# Load the class dynamically
module_path, class_name = matchers[matcher_name].rsplit(".", 1)
module = importlib.import_module(module_path)

return getattr(module, class_name)(**matcher_kwargs)


matchers = {method.matcher_name: method.matcher_path for method in TopkMatchers}
2 changes: 0 additions & 2 deletions bdikit/value_matching/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +0,0 @@
from bdikit.value_matching.algorithms.polyfuzz import *
from bdikit.value_matching.algorithms.gpt import *
Empty file.
File renamed without changes.
56 changes: 32 additions & 24 deletions bdikit/value_matching/matcher_factory.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,44 @@
import importlib
from enum import Enum
from typing import Mapping, Any, Type
from typing import Mapping, Any
from bdikit.value_matching.base import BaseValueMatcher
from bdikit.value_matching import (
GPTValueMatcher,
TFIDFValueMatcher,
EditDistanceValueMatcher,
EmbeddingValueMatcher,
FastTextValueMatcher,
)


class ValueMatchers(Enum):
TFIDF = ("tfidf", TFIDFValueMatcher)
EDIT = ("edit_distance", EditDistanceValueMatcher)
EMBEDDINGS = ("embedding", EmbeddingValueMatcher)
FASTTEXT = ("fasttext", FastTextValueMatcher)
GPT = ("gpt", GPTValueMatcher)
TFIDF = ("tfidf", "bdikit.value_matching.polyfuzz.TFIDFValueMatcher")
EDIT = (
"edit_distance",
"bdikit.value_matching.polyfuzz.EditDistanceValueMatcher",
)
EMBEDDINGS = (
"embedding",
"bdikit.value_matching.polyfuzz.EmbeddingValueMatcher",
)
FASTTEXT = (
"fasttext",
"bdikit.value_matching.polyfuzz.FastTextValueMatcher",
)
GPT = ("gpt", "bdikit.value_matching.gpt.GPTValueMatcher")

def __init__(self, method_name: str, method_class: Type[BaseValueMatcher]):
self.method_name = method_name
self.method_class = method_class
def __init__(self, matcher_name: str, matcher_path: str):
self.matcher_name = matcher_name
self.matcher_path = matcher_path

@staticmethod
def get_instance(
method_name: str, **method_kwargs: Mapping[str, Any]
def get_matcher(
matcher_name: str, **matcher_kwargs: Mapping[str, Any]
) -> BaseValueMatcher:
methods = {method.method_name: method.method_class for method in ValueMatchers}
try:
return methods[method_name](**method_kwargs)
except KeyError:
names = ", ".join(list(methods.keys()))
if matcher_name not in matchers:
names = ", ".join(list(matchers.keys()))
raise ValueError(
f"The {method_name} algorithm is not supported. "
f"The {matcher_name} algorithm is not supported. "
f"Supported algorithms are: {names}"
)
# Load the class dynamically
module_path, class_name = matchers[matcher_name].rsplit(".", 1)
module = importlib.import_module(module_path)

return getattr(module, class_name)(**matcher_kwargs)


matchers = {method.matcher_name: method.matcher_path for method in ValueMatchers}
File renamed without changes.
6 changes: 3 additions & 3 deletions tests/test_schema_matching.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import pandas as pd
from bdikit.schema_matching.best import (
from bdikit.schema_matching.best.valentine import (
SimFloodSchemaMatcher,
JaccardSchemaMatcher,
DistributionBasedSchemaMatcher,
ComaSchemaMatcher,
CupidSchemaMatcher,
TwoPhaseSchemaMatcher,
ContrastiveLearningSchemaMatcher,
)
from bdikit.schema_matching.best.twophase import TwoPhaseSchemaMatcher
from bdikit.schema_matching.best.contrastivelearning import ContrastiveLearningSchemaMatcher


def test_basic_column_mapping_algorithms():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_value_matching.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest
import pandas as pd
from bdikit.value_matching import (
from bdikit.value_matching.polyfuzz import (
TFIDFValueMatcher,
EditDistanceValueMatcher,
)
Expand Down

0 comments on commit c81514c

Please sign in to comment.