diff --git a/assets/linkage/dibbs_basic_algorithm.json b/assets/linkage/dibbs_basic_algorithm.json index 8e5f7d7..1f3f8ab 100644 --- a/assets/linkage/dibbs_basic_algorithm.json +++ b/assets/linkage/dibbs_basic_algorithm.json @@ -2,8 +2,8 @@ "algorithm": [ { "funcs": { - "first_name": "feature_match_fuzzy_string", - "last_name": "feature_match_exact" + "first_name": "func:recordlinker.linkage.matchers.feature_match_fuzzy_string", + "last_name": "func:recordlinker.linkage.matchers.feature_match_exact" }, "blocks": [ { @@ -17,7 +17,7 @@ "value": "sex" } ], - "matching_rule": "eval_perfect_match", + "matching_rule": "func:recordlinker.linkage.matchers.eval_perfect_match", "cluster_ratio": 0.9, "kwargs": { "thresholds": { @@ -32,8 +32,8 @@ }, { "funcs": { - "address": "feature_match_fuzzy_string", - "birthdate": "feature_match_exact" + "address": "func:recordlinker.linkage.matchers.feature_match_fuzzy_string", + "birthdate": "func:recordlinker.linkage.matchers.feature_match_exact" }, "blocks": [ { @@ -51,7 +51,7 @@ "value": "sex" } ], - "matching_rule": "eval_perfect_match", + "matching_rule": "func:recordlinker.linkage.matchers.eval_perfect_match", "cluster_ratio": 0.9, "kwargs": { "thresholds": { diff --git a/assets/linkage/dibbs_enhanced_algorithm.json b/assets/linkage/dibbs_enhanced_algorithm.json index fa9f499..f687cab 100644 --- a/assets/linkage/dibbs_enhanced_algorithm.json +++ b/assets/linkage/dibbs_enhanced_algorithm.json @@ -2,8 +2,8 @@ "algorithm": [ { "funcs": { - "first_name": "feature_match_log_odds_fuzzy_compare", - "last_name": "feature_match_log_odds_fuzzy_compare" + "first_name": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", + "last_name": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare" }, "blocks": [ { @@ -17,7 +17,7 @@ "value": "sex" } ], - "matching_rule": "eval_log_odds_cutoff", + "matching_rule": "func:recordlinker.linkage.matchers.eval_log_odds_cutoff", "cluster_ratio": 0.9, "kwargs": { 
"similarity_measure": "JaroWinkler", @@ -45,8 +45,8 @@ }, { "funcs": { - "address": "feature_match_log_odds_fuzzy_compare", - "birthdate": "feature_match_log_odds_fuzzy_compare" + "address": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", + "birthdate": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare" }, "blocks": [ { @@ -64,7 +64,7 @@ "value": "sex" } ], - "matching_rule": "eval_log_odds_cutoff", + "matching_rule": "func:recordlinker.linkage.matchers.eval_log_odds_cutoff", "cluster_ratio": 0.9, "kwargs": { "similarity_measure": "JaroWinkler", diff --git a/src/recordlinker/linkage/algorithms.py b/src/recordlinker/linkage/algorithms.py index 57cdf0c..21fa08a 100644 --- a/src/recordlinker/linkage/algorithms.py +++ b/src/recordlinker/linkage/algorithms.py @@ -25,22 +25,22 @@ DIBBS_BASIC = [ { "funcs": { - "first_name": "feature_match_fuzzy_string", - "last_name": "feature_match_exact", + "first_name": "func:recordlinker.linkage.matchers.feature_match_fuzzy_string", + "last_name": "func:recordlinker.linkage.matchers.feature_match_exact", }, "blocks": [ {"value": "birthdate"}, {"value": "mrn", "transformation": "last4"}, {"value": "sex"}, ], - "matching_rule": "eval_perfect_match", + "matching_rule": "func:recordlinker.linkage.matchers.eval_perfect_match", "cluster_ratio": 0.9, "kwargs": {"thresholds": FUZZY_THRESHOLDS}, }, { "funcs": { - "address": "feature_match_fuzzy_string", - "birthdate": "feature_match_exact", + "address": "func:recordlinker.linkage.matchers.feature_match_fuzzy_string", + "birthdate": "func:recordlinker.linkage.matchers.feature_match_exact", }, "blocks": [ {"value": "zip"}, @@ -48,7 +48,7 @@ {"value": "last_name", "transformation": "first4"}, {"value": "sex"}, ], - "matching_rule": "eval_perfect_match", + "matching_rule": "func:recordlinker.linkage.matchers.eval_perfect_match", "cluster_ratio": 0.9, "kwargs": {"thresholds": FUZZY_THRESHOLDS}, }, @@ -57,15 +57,15 @@ DIBBS_ENHANCED = [ { "funcs": { 
- "first_name": "feature_match_log_odds_fuzzy_compare", - "last_name": "feature_match_log_odds_fuzzy_compare", + "first_name": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", + "last_name": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", }, "blocks": [ {"value": "birthdate"}, {"value": "mrn", "transformation": "last4"}, {"value": "sex"}, ], - "matching_rule": "eval_log_odds_cutoff", + "matching_rule": "func:recordlinker.linkage.matchers.eval_log_odds_cutoff", "cluster_ratio": 0.9, "kwargs": { "similarity_measure": "JaroWinkler", @@ -76,8 +76,8 @@ }, { "funcs": { - "address": "feature_match_log_odds_fuzzy_compare", - "birthdate": "feature_match_log_odds_fuzzy_compare", + "address": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", + "birthdate": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", }, "blocks": [ {"value": "zip"}, @@ -85,7 +85,7 @@ {"value": "last_name", "transformation": "first4"}, {"value": "sex"}, ], - "matching_rule": "eval_log_odds_cutoff", + "matching_rule": "func:recordlinker.linkage.matchers.eval_log_odds_cutoff", "cluster_ratio": 0.9, "kwargs": { "similarity_measure": "JaroWinkler", diff --git a/src/recordlinker/linkage/link.py b/src/recordlinker/linkage/link.py index 189487d..de2f6de 100644 --- a/src/recordlinker/linkage/link.py +++ b/src/recordlinker/linkage/link.py @@ -1,20 +1,15 @@ -import copy import datetime import hashlib import json import logging import pathlib -from typing import Callable -from typing import List from typing import Union from pydantic import Field +from recordlinker.linkage import utils from recordlinker.linkage.mpi import BaseMPIConnectorClient from recordlinker.linkage.mpi import DIBBsMPIConnectorClient -from recordlinker.linkage.utils import compare_strings -from recordlinker.linkage.utils import datetime_to_str -from recordlinker.linkage.utils import extract_value_with_resource_path LINKING_FIELDS_TO_FHIRPATHS = 
{ "first_name": "Patient.name.given", @@ -29,7 +24,7 @@ } -def compile_match_lists(match_lists: List[dict], cluster_mode: bool = False): +def compile_match_lists(match_lists: list[dict], cluster_mode: bool = False): """ Turns a list of matches of either clusters or candidate pairs found during linkage into a single unified structure holding all found matches @@ -67,37 +62,8 @@ def compile_match_lists(match_lists: List[dict], cluster_mode: bool = False): return matches -def eval_perfect_match(feature_comparisons: List, **kwargs) -> bool: - """ - Determines whether a given set of feature comparisons represent a - 'perfect' match (i.e. whether all features that were compared match - in whatever criteria was specified for them). - - :param feature_comparisons: A list of 1s and 0s, one for each feature - that was compared during the match algorithm. - :return: The evaluation of whether the given features all match. - """ - return sum(feature_comparisons) == len(feature_comparisons) - - -def eval_log_odds_cutoff(feature_comparisons: List, **kwargs) -> bool: - """ - Determines whether a given set of feature comparisons matches enough - to be the result of a true patient link instead of just random chance. - This is represented using previously computed log-odds ratios. - - :param feature_comparisons: A list of floats representing the log-odds - score of each field computed on. - :return: Whether the feature comparisons score well enough to be - considered a match. 
- """ - if "true_match_threshold" not in kwargs: - raise KeyError("Cutoff threshold for true matches must be passed.") - return sum(feature_comparisons) >= kwargs["true_match_threshold"] - - def extract_blocking_values_from_record( - record: dict, blocking_fields: List[dict] + record: dict, blocking_fields: list[dict] ) -> dict: """ Extracts values from a given patient record for eventual use in database @@ -151,7 +117,7 @@ def extract_blocking_values_from_record( block = block_dict.get("value") try: # Apply utility extractor for safe parsing - value = extract_value_with_resource_path( + value = utils.extract_value_with_resource_path( record, LINKING_FIELDS_TO_FHIRPATHS[block], selection_criteria="first", @@ -186,164 +152,6 @@ def extract_blocking_values_from_record( return block_vals -def feature_match_exact( - record_i: List, - record_j: List, - feature_col: str, - col_to_idx: dict[str, int], - **kwargs: dict, -) -> bool: - """ - Determines whether a single feature in a given pair of records - constitutes an exact match (perfect equality). - - :param record_i: One of the records in the candidate pair to evaluate. - :param record_j: The second record in the candidate pair. - :param feature_col: The name of the column being evaluated (e.g. "city"). - :param col_to_idx: A dictionary mapping column names to the numeric index - in which they occur in order in the data. - :return: A boolean indicating whether the features are an exact match. - """ - idx = col_to_idx[feature_col] - return record_i[idx] == record_j[idx] - - -def feature_match_four_char( - record_i: List, - record_j: List, - feature_col: str, - col_to_idx: dict[str, int], - **kwargs: dict, -) -> bool: - """ - Determines whether a string feature in a pair of records exactly matches - on the first four characters. - - :param record_i: One of the records in the candidate pair to evaluate. - :param record_j: The second record in the candidate pair. 
- :param feature_col: The name of the column being evaluated (e.g. "city"). - :param col_to_idx: A dictionary mapping column names to the numeric index - in which they occur in order in the data. - :return: A boolean indicating whether the features are a match. - """ - idx = col_to_idx[feature_col] - first_four_i = record_i[idx][: min(4, len(record_i[idx]))] - first_four_j = record_j[idx][: min(4, len(record_j[idx]))] - return first_four_i == first_four_j - - -def feature_match_fuzzy_string( - record_i: List, - record_j: List, - feature_col: str, - col_to_idx: dict[str, int], - **kwargs: dict, -) -> bool: - """ - Determines whether two strings in a given pair of records are close - enough to constitute a partial match. The exact nature of the match - is determined by the specified string comparison function (see - harmonization/utils/compare_strings for more details) as well as a - scoring threshold the comparison must meet or exceed. - - :param record_i: One of the records in the candidate pair to evaluate. - :param record_j: The second record in the candidate pair. - :param feature_col: The name of the column being evaluated (e.g. "city"). - :param col_to_idx: A dictionary mapping column names to the numeric index - in which they occur in order in the data. - :param **kwargs: Optionally, a dictionary including specifications for - the string comparison metric to use, as well as the cutoff score - beyond which to classify the strings as a partial match. - :return: A boolean indicating whether the features are a fuzzy match. 
- """ - idx = col_to_idx[feature_col] - - # Convert datetime obj to str using helper function - if feature_col == "birthdate": - record_i[idx] = datetime_to_str(record_i[idx]) - record_j[idx] = datetime_to_str(record_j[idx]) - - # Special case for two empty strings, since we don't want vacuous - # equality (or in-) to penalize the score - if record_i[idx] == "" and record_j[idx] == "": - return True - if record_i[idx] is None and record_j[idx] is None: - return True - - similarity_measure, threshold = _get_fuzzy_params(feature_col, **kwargs) - score = compare_strings(record_i[idx], record_j[idx], similarity_measure) - return score >= threshold - - -def feature_match_log_odds_exact( - record_i: List, - record_j: List, - feature_col: str, - col_to_idx: dict[str, int], - **kwargs: dict, -) -> float: - """ - Determines whether two feature values in two records should earn the full - log-odds similarity score (i.e. they match exactly) or whether they - should earn no weight (they differ). Used for fields for which fuzzy - comparisons are inappropriate, such as sex. - - :param record_i: One of the records in the candidate pair to evaluate. - :param record_j: The second record in the candidate pair. - :param feature_col: The name of the column being evaluated (e.g. "city"). - :param col_to_idx: A dictionary mapping column names to the numeric index - in which they occur in order in the data. - :return: A float of the score the feature comparison earned. 
- """ - if "log_odds" not in kwargs: - raise KeyError("Mapping of columns to m/u log-odds must be provided.") - col_odds = kwargs["log_odds"][feature_col] - idx = col_to_idx[feature_col] - if record_i[idx] == record_j[idx]: - return col_odds - else: - return 0.0 - - -def feature_match_log_odds_fuzzy_compare( - record_i: List, - record_j: List, - feature_col: str, - col_to_idx: dict[str, int], - **kwargs: dict, -) -> float: - """ - Determines the weighted string-odds similarly score earned by two - feature values in two records, as a function of the pre-computed - log-odds weights and the string similarity between the two features. - This scales the full score that would be earned from a perfect - match to a degree of partial weight appropriate to how similar the - two strings are. - - :param record_i: One of the records in the candidate pair to evaluate. - :param record_j: The second record in the candidate pair. - :param feature_col: The name of the column being evaluated (e.g. "city"). - :param col_to_idx: A dictionary mapping column names to the numeric index - in which they occur in order in the data. - :return: A float of the score the feature comparison earned. - """ - if "log_odds" not in kwargs: - raise KeyError("Mapping of columns to m/u log-odds must be provided.") - col_odds = kwargs["log_odds"][feature_col] - idx = col_to_idx[feature_col] - - # Convert datetime obj to str using helper function - if feature_col == "birthdate": - record_i[idx] = datetime_to_str(record_i[idx]) - record_j[idx] = datetime_to_str(record_j[idx]) - - similarity_measure, threshold = _get_fuzzy_params(feature_col, **kwargs) - score = compare_strings(record_i[idx], record_j[idx], similarity_measure) - if score < threshold: - score = 0.0 - return score * col_odds - - def generate_hash_str(linking_identifier: str, salt_str: str) -> str: """ Generates a hash for a given string of concatenated patient information. 
The hash @@ -363,7 +171,7 @@ def generate_hash_str(linking_identifier: str, salt_str: str) -> str: def link_record_against_mpi( record: dict, - algo_config: List[dict], + algo_config: list[dict], external_person_id: str = None, mpi_client: BaseMPIConnectorClient = None, ) -> tuple[bool, str]: @@ -395,15 +203,7 @@ def link_record_against_mpi( # Need to bind function names back to their symbolic invocations # in context of the module--i.e. turn the string of a function # name back into the callable defined in link.py - - algo_config = copy.deepcopy(algo_config) - logging.info( - f"Starting _bind_func_names_to_invocations at: {datetime.datetime.now().strftime('%m-%d-%yT%H:%M:%S.%f')}" # noqa - ) - algo_config = _bind_func_names_to_invocations(algo_config) - logging.info( - f"Done with _bind_func_names_to_invocations at:{datetime.datetime.now().strftime('%m-%d-%yT%H:%M:%S.%f')}" # noqa - ) + algo_config = [utils.bind_functions(linkage_pass) for linkage_pass in algo_config] # Membership ratios need to persist across linkage passes so that we can # find the highest scoring match across all trials @@ -553,69 +353,7 @@ def load_json_probs(path: pathlib.Path): ) -def match_within_block( - block: List[List], - feature_funcs: dict[str, Callable], - col_to_idx: dict[str, int], - match_eval: Callable, - **kwargs, -) -> List[tuple]: - """ - Performs matching on all candidate pairs of records within a given block - of data. Actual partitioning of the data should be done outside this - function, as it compares all possible pairs within the provided partition. - Uses a given construction of feature comparison rules as well as a - match evaluation rule to determine the final verdict on whether two - records are indeed a match. - - A feature function is of the form "feature_match_X" for some condition - X; it must accept two records (lists of data), an index i in which the - feature to compare is stored, and the parameter **kwargs. 
It must return - a boolean indicating whether the features "match" for whatever definition - of match the function uses (i.e. this allows modular logic to apply to - different features in the compared records). Note that not all features - in a record need a comparison function defined. - - A match evaluation rule is a function of the form "eval_X" for some - condition X. It accepts as input a list of booleans, one for each feature - that was compared with feature funcs, and determines whether the - comparisons constitute a match according to X. - - :param block: A list of records to check for matches. Each record in - the list is itself a list of features. The first feature of the - record must be an "id" for the record. - :param feature_funcs: A dictionary mapping feature indices to functions - used to evaluate those features for a match. - :param col_to_idx: A dictionary mapping column names to the numeric index - in which they occur in order in the data. - :param match_eval: A function for determining whether a given set of - feature comparisons constitutes a match for linkage. - :return: A list of 2-tuples of the form (i,j), where i,j give the indices - in the block of data of records deemed to match. 
- """ - match_pairs = [] - - # Dynamic programming table: order doesn't matter, so only need to - # check each combo of i,j once - for i, record_i in enumerate(block): - for j in range(i + 1, len(block)): - record_j = block[j] - feature_comps = [ - feature_funcs[feature_col]( - record_i, record_j, feature_col, col_to_idx, **kwargs - ) - for feature_col in feature_funcs - ] - - # If it's a match, store the result - is_match = match_eval(feature_comps, **kwargs) - if is_match: - match_pairs.append((i, j)) - - return match_pairs - - -def read_linkage_config(config_file: pathlib.Path) -> List[dict]: +def read_linkage_config(config_file: pathlib.Path) -> list[dict]: """ Reads and generates a record linkage algorithm configuration list from the provided filepath, which should point to a JSON file. A record @@ -724,7 +462,7 @@ def score_linkage_vs_truth( return (sensitivity, specificity, ppv, f1) -def write_linkage_config(linkage_algo: List[dict], file_to_write: pathlib.Path) -> None: +def write_linkage_config(linkage_algo: list[dict], file_to_write: pathlib.Path) -> None: """ Save a provided algorithm description as a JSON dictionary at the provided filepath location. 
Algorithm descriptions are lists of dictionaries, one @@ -752,9 +490,9 @@ def write_linkage_config(linkage_algo: List[dict], file_to_write: pathlib.Path) algo_json = [] for rl_pass in linkage_algo: pass_json = {} - pass_json["funcs"] = {col: f.__name__ for (col, f) in rl_pass["funcs"].items()} + pass_json["funcs"] = {col: utils.func_to_str(f) for (col, f) in rl_pass["funcs"].items()} pass_json["blocks"] = rl_pass["blocks"] - pass_json["matching_rule"] = rl_pass["matching_rule"].__name__ + pass_json["matching_rule"] = utils.func_to_str(rl_pass["matching_rule"]) if rl_pass.get("cluster_ratio", None) is not None: pass_json["cluster_ratio"] = rl_pass["cluster_ratio"] if rl_pass.get("kwargs", None) is not None: @@ -767,58 +505,9 @@ def write_linkage_config(linkage_algo: List[dict], file_to_write: pathlib.Path) out.write(json.dumps(linkage_json)) -def _bind_func_names_to_invocations(algo_config: List[dict]): - """ - Helper method that re-maps the string names of functions to their - callable invocations as defined within the `link.py` module. - """ - for lp in algo_config: - feature_funcs = lp["funcs"] - for func in feature_funcs: - if type(feature_funcs[func]) is str: # noqa - feature_funcs[func] = globals()[feature_funcs[func]] - if type(lp["matching_rule"]) is str: # noqa - lp["matching_rule"] = globals()[lp["matching_rule"]] - return algo_config - - -def _eval_record_in_cluster( - block: List[List], - i: int, - cluster: set, - cluster_ratio: float, - feature_funcs: dict[str, Callable], - col_to_idx: dict[str, int], - match_eval: Callable, - **kwargs, -): - """ - A helper function used to evaluate whether a given incoming record - satisfies the matching proportion threshold of an existing cluster, - and therefore would belong to the cluster. 
- """ - record_i = block[i] - num_matched = 0.0 - for j in cluster: - record_j = block[j] - feature_comps = [ - feature_funcs[feature_col]( - record_i, record_j, feature_col, col_to_idx, **kwargs - ) - for feature_col in feature_funcs - ] - - is_match = match_eval(feature_comps) - if is_match: - num_matched += 1.0 - if (num_matched / len(cluster)) >= cluster_ratio: - return True - return False - - def _compare_records( - record: List, - mpi_patient: List, + record: list, + mpi_patient: list, feature_funcs: dict, col_to_idx: dict[str, int], matching_rule: callable, @@ -847,8 +536,8 @@ def _compare_records( def _compare_records_field_helper( - record: List, - mpi_patient: List, + record: list, + mpi_patient: list, feature_col: str, col_to_idx: dict[str, int], feature_funcs: dict, @@ -869,8 +558,8 @@ def _compare_records_field_helper( def _compare_address_elements( - record: List, - mpi_patient: List, + record: list, + mpi_patient: list, feature_funcs: dict, feature_col: str, col_to_idx: dict[str, int], @@ -893,8 +582,8 @@ def _compare_address_elements( def _compare_name_elements( - record: List, - mpi_patient: List, + record: list, + mpi_patient: list, feature_funcs: dict, feature_col: str, col_to_idx: dict[str, int], @@ -927,7 +616,7 @@ def _condense_extract_address_from_resource(resource: dict, field: str): """ expanded_address_fhirpath = LINKING_FIELDS_TO_FHIRPATHS[field] expanded_address_fhirpath = ".".join(expanded_address_fhirpath.split(".")[:-1]) - list_of_address_objects = extract_value_with_resource_path( + list_of_address_objects = utils.extract_value_with_resource_path( resource, expanded_address_fhirpath, "all" ) if field == "address": @@ -958,7 +647,7 @@ def _find_strongest_link(linkage_scores: dict) -> str: return best_person -def _flatten_patient_resource(resource: dict, col_to_idx: dict) -> List: +def _flatten_patient_resource(resource: dict, col_to_idx: dict) -> list: """ Helper method that flattens an incoming patient resource into a list whose 
elements are the keys of the FHIR dictionary, reformatted and ordered @@ -986,7 +675,7 @@ def _flatten_patient_field_helper(resource: dict, field: str) -> any: algorithm. """ if field == "first_name": - vals = extract_value_with_resource_path( + vals = utils.extract_value_with_resource_path( resource, LINKING_FIELDS_TO_FHIRPATHS[field], selection_criteria="all" ) return vals if vals is not None else [""] @@ -994,40 +683,13 @@ def _flatten_patient_field_helper(resource: dict, field: str) -> any: vals = _condense_extract_address_from_resource(resource, field) return vals if vals is not None else [""] else: - val = extract_value_with_resource_path( + val = utils.extract_value_with_resource_path( resource, LINKING_FIELDS_TO_FHIRPATHS[field], selection_criteria="first" ) return val if val is not None else "" -def _get_fuzzy_params(col: str, **kwargs) -> tuple[str, str]: - """ - Helper method to quickly determine the appropriate similarity measure - and fuzzy matching threshold to use for fuzzy-comparing a particular - field between two records. - - :param col: The string name of the column being used in a fuzzy - comparison. - :param kwargs: Optionally, a dictionary of keyword arguments containing - values for a similarity metric and appropriate fuzzy thresholds. - :return: A tuple containing the similarity metric to use and the - fuzzy comparison threshold to measure against. 
- """ - similarity_measure = "JaroWinkler" - if "similarity_measure" in kwargs: - similarity_measure = kwargs["similarity_measure"] - - threshold = 0.7 - if "thresholds" in kwargs: - if col in kwargs["thresholds"]: - threshold = kwargs["thresholds"][col] - elif "threshold" in kwargs: - threshold = kwargs["threshold"] - - return similarity_measure, threshold - - -def _group_patient_block_by_person(data_block: List[list]) -> dict[str, List]: +def _group_patient_block_by_person(data_block: list[list]) -> dict[str, list]: """ Helper method that partitions the block of patient data returned from the MPI into clusters of records according to their linked Person ID. @@ -1042,8 +704,8 @@ def _group_patient_block_by_person(data_block: List[list]) -> dict[str, List]: def _map_matches_to_record_ids( - match_list: Union[List[tuple], List[set]], data_block, cluster_mode: bool = False -) -> List[tuple]: + match_list: Union[list[tuple], list[set]], data_block, cluster_mode: bool = False +) -> list[tuple]: """ Helper function to turn a list of tuples of row indices in a block of data into a list of tuples of the IDs of the records within @@ -1066,71 +728,6 @@ def _map_matches_to_record_ids( return matched_records -def _match_within_block_cluster_ratio( - block: List[List], - cluster_ratio: float, - feature_funcs: dict[str, Callable], - col_to_idx: dict[str, int], - match_eval: Callable, - **kwargs, -) -> List[set]: - """ - A matching function for statistically testing the impact of membership - ratio to the quality of clusters formed. This function behaves similarly - to `match_within_block`, except that rather than identifying all pairwise - candidates which are deemed matches, the function creates a list of - clusters of patients, where each cluster constitutes what would be a - single "representative" patient in the database. 
The formation of - clusters is determined by the parameter `cluster_ratio`, which defines - the proportion of other records in an existing cluster that a new - incoming record must match in order to join the cluster. - - :param block: A list of records to check for matches. Each record in - the list is itself a list of features. The first feature of the - record must be an "id" for the record. - :param cluster_ratio: A float giving the proportion of records in an - existing cluster that a new incoming record must match in order - to qualify for membership in the cluster. - :param feature_funcs: A dictionary mapping feature indices to functions - used to evaluate those features for a match. - :param col_to_idx: A dictionary mapping column names to the numeric index - in which they occur in order in the data. - :param match_eval: A function for determining whether a given set of - feature comparisons constitutes a match for linkage. - :return: A list of 2-tuples of the form (i,j), where i,j give the indices - in the block of data of records deemed to match. 
- """ - clusters = [] - for i in range(len(block)): - # Base case - if len(clusters) == 0: - clusters.append({i}) - continue - found_master_cluster = False - - # Iterate through clusters to find one that we match with - for cluster in clusters: - belongs = _eval_record_in_cluster( - block, - i, - cluster, - cluster_ratio, - feature_funcs, - col_to_idx, - match_eval, - **kwargs, - ) - if belongs: - found_master_cluster = True - cluster.add(i) - break - - # Create a new singleton if no other cluster qualified - if not found_master_cluster: - clusters.append({i}) - return clusters - - def _is_empty_extraction_field(block_vals: dict, field: str): """ Helper method that determines when a field extracted from an incoming diff --git a/src/recordlinker/linkage/matchers.py b/src/recordlinker/linkage/matchers.py new file mode 100644 index 0000000..0c73662 --- /dev/null +++ b/src/recordlinker/linkage/matchers.py @@ -0,0 +1,427 @@ +""" +recordlinker.linkage.matchers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This module contains functions for evaluating whether two records are +a match based on the similarity of their features. These functions are +used by the record linkage algorithm to determine whether a candidate +pair of records should be considered a match or not. +""" +import typing + +import rapidfuzz + +from recordlinker.linkage import utils + +SIMILARITY_MEASURES = typing.Literal["JaroWinkler", "Levenshtein", "DamerauLevenshtein"] + + +def compare_strings( + string1: str, + string2: str, + similarity_measure: SIMILARITY_MEASURES = "JaroWinkler", +) -> float: + """ + Returns the normalized similarity measure between string1 and string2, as + determined by the similarlity measure. The higher the normalized similarity measure + (up to 1.0), the more similar string1 and string2 are. A normalized similarity + measure of 0.0 means string1 and string 2 are not at all similar. This function + expects basic text cleaning (e.g. removal of numeric characters, trimming of spaces, + etc.) 
to already have been performed on the input strings. + + :param string1: First string for comparison. + :param string2: Second string for comparison. + :param similarity_measure: The method used to measure the similarity between two + strings, defaults to "JaroWinkler". + - JaroWinkler: a ratio of matching characters and transpositions needed to + transform string1 into string2. + - Levenshtein: the number of edits (excluding transpositions) needed to transform + string1 into string2. + - DamerauLevenshtein: the number of edits (including transpositions) needed to + transform string1 into string2. + :return: The normalized similarity between string1 and string2, with 0 representing + no similarity between string1 and string2, and 1 meaning string1 and string2 are + identical words. + """ + if similarity_measure == "JaroWinkler": + return rapidfuzz.distance.JaroWinkler.normalized_similarity(string1, string2) + elif similarity_measure == "Levenshtein": + return rapidfuzz.distance.Levenshtein.normalized_similarity(string1, string2) + elif similarity_measure == "DamerauLevenshtein": + return rapidfuzz.distance.DamerauLevenshtein.normalized_similarity( + string1, string2 + ) + + +def eval_perfect_match(feature_comparisons: list, **kwargs) -> bool: + """ + Determines whether a given set of feature comparisons represent a + 'perfect' match (i.e. whether all features that were compared match + in whatever criteria was specified for them). + + :param feature_comparisons: A list of 1s and 0s, one for each feature + that was compared during the match algorithm. + :return: The evaluation of whether the given features all match. + """ + return sum(feature_comparisons) == len(feature_comparisons) + + +def eval_log_odds_cutoff(feature_comparisons: list, **kwargs) -> bool: + """ + Determines whether a given set of feature comparisons matches enough + to be the result of a true patient link instead of just random chance. 
+ This is represented using previously computed log-odds ratios. + + :param feature_comparisons: A list of floats representing the log-odds + score of each field computed on. + :return: Whether the feature comparisons score well enough to be + considered a match. + """ + if "true_match_threshold" not in kwargs: + raise KeyError("Cutoff threshold for true matches must be passed.") + return sum(feature_comparisons) >= kwargs["true_match_threshold"] + + +def feature_match_exact( + record_i: list, + record_j: list, + feature_col: str, + col_to_idx: dict[str, int], + **kwargs: dict, +) -> bool: + """ + Determines whether a single feature in a given pair of records + constitutes an exact match (perfect equality). + + :param record_i: One of the records in the candidate pair to evaluate. + :param record_j: The second record in the candidate pair. + :param feature_col: The name of the column being evaluated (e.g. "city"). + :param col_to_idx: A dictionary mapping column names to the numeric index + in which they occur in order in the data. + :return: A boolean indicating whether the features are an exact match. + """ + idx = col_to_idx[feature_col] + return record_i[idx] == record_j[idx] + + +def feature_match_four_char( + record_i: list, + record_j: list, + feature_col: str, + col_to_idx: dict[str, int], + **kwargs: dict, +) -> bool: + """ + Determines whether a string feature in a pair of records exactly matches + on the first four characters. + + :param record_i: One of the records in the candidate pair to evaluate. + :param record_j: The second record in the candidate pair. + :param feature_col: The name of the column being evaluated (e.g. "city"). + :param col_to_idx: A dictionary mapping column names to the numeric index + in which they occur in order in the data. + :return: A boolean indicating whether the features are a match. 
+ """ + idx = col_to_idx[feature_col] + first_four_i = record_i[idx][: min(4, len(record_i[idx]))] + first_four_j = record_j[idx][: min(4, len(record_j[idx]))] + return first_four_i == first_four_j + + +def feature_match_fuzzy_string( + record_i: list, + record_j: list, + feature_col: str, + col_to_idx: dict[str, int], + **kwargs: dict, +) -> bool: + """ + Determines whether two strings in a given pair of records are close + enough to constitute a partial match. The exact nature of the match + is determined by the specified string comparison function (see + compare_strings for more details) as well as a + scoring threshold the comparison must meet or exceed. + + :param record_i: One of the records in the candidate pair to evaluate. + :param record_j: The second record in the candidate pair. + :param feature_col: The name of the column being evaluated (e.g. "city"). + :param col_to_idx: A dictionary mapping column names to the numeric index + in which they occur in order in the data. + :param **kwargs: Optionally, a dictionary including specifications for + the string comparison metric to use, as well as the cutoff score + beyond which to classify the strings as a partial match. + :return: A boolean indicating whether the features are a fuzzy match. 
+ """ + idx = col_to_idx[feature_col] + + # Convert datetime obj to str using helper function + if feature_col == "birthdate": + record_i[idx] = utils.datetime_to_str(record_i[idx]) + record_j[idx] = utils.datetime_to_str(record_j[idx]) + + # Special case for two empty strings, since we don't want vacuous + # equality (or in-) to penalize the score + if record_i[idx] == "" and record_j[idx] == "": + return True + if record_i[idx] is None and record_j[idx] is None: + return True + + similarity_measure, threshold = _get_fuzzy_params(feature_col, **kwargs) + score = compare_strings(record_i[idx], record_j[idx], similarity_measure) + return score >= threshold + + +def feature_match_log_odds_exact( + record_i: list, + record_j: list, + feature_col: str, + col_to_idx: dict[str, int], + **kwargs: dict, +) -> float: + """ + Determines whether two feature values in two records should earn the full + log-odds similarity score (i.e. they match exactly) or whether they + should earn no weight (they differ). Used for fields for which fuzzy + comparisons are inappropriate, such as sex. + + :param record_i: One of the records in the candidate pair to evaluate. + :param record_j: The second record in the candidate pair. + :param feature_col: The name of the column being evaluated (e.g. "city"). + :param col_to_idx: A dictionary mapping column names to the numeric index + in which they occur in order in the data. + :return: A float of the score the feature comparison earned. 
+ """ + if "log_odds" not in kwargs: + raise KeyError("Mapping of columns to m/u log-odds must be provided.") + col_odds = kwargs["log_odds"][feature_col] + idx = col_to_idx[feature_col] + if record_i[idx] == record_j[idx]: + return col_odds + else: + return 0.0 + + +def feature_match_log_odds_fuzzy_compare( + record_i: list, + record_j: list, + feature_col: str, + col_to_idx: dict[str, int], + **kwargs: dict, +) -> float: + """ + Determines the weighted string-odds similarly score earned by two + feature values in two records, as a function of the pre-computed + log-odds weights and the string similarity between the two features. + This scales the full score that would be earned from a perfect + match to a degree of partial weight appropriate to how similar the + two strings are. + + :param record_i: One of the records in the candidate pair to evaluate. + :param record_j: The second record in the candidate pair. + :param feature_col: The name of the column being evaluated (e.g. "city"). + :param col_to_idx: A dictionary mapping column names to the numeric index + in which they occur in order in the data. + :return: A float of the score the feature comparison earned. 
+ """ + if "log_odds" not in kwargs: + raise KeyError("Mapping of columns to m/u log-odds must be provided.") + col_odds = kwargs["log_odds"][feature_col] + idx = col_to_idx[feature_col] + + # Convert datetime obj to str using helper function + if feature_col == "birthdate": + record_i[idx] = utils.datetime_to_str(record_i[idx]) + record_j[idx] = utils.datetime_to_str(record_j[idx]) + + similarity_measure, threshold = _get_fuzzy_params(feature_col, **kwargs) + score = compare_strings(record_i[idx], record_j[idx], similarity_measure) + if score < threshold: + score = 0.0 + return score * col_odds + + +def match_within_block( + block: list[list], + feature_funcs: dict[str, typing.Callable], + col_to_idx: dict[str, int], + match_eval: typing.Callable, + **kwargs, +) -> list[tuple]: + """ + Performs matching on all candidate pairs of records within a given block + of data. Actual partitioning of the data should be done outside this + function, as it compares all possible pairs within the provided partition. + Uses a given construction of feature comparison rules as well as a + match evaluation rule to determine the final verdict on whether two + records are indeed a match. + + A feature function is of the form "feature_match_X" for some condition + X; it must accept two records (lists of data), an index i in which the + feature to compare is stored, and the parameter **kwargs. It must return + a boolean indicating whether the features "match" for whatever definition + of match the function uses (i.e. this allows modular logic to apply to + different features in the compared records). Note that not all features + in a record need a comparison function defined. + + A match evaluation rule is a function of the form "eval_X" for some + condition X. It accepts as input a list of booleans, one for each feature + that was compared with feature funcs, and determines whether the + comparisons constitute a match according to X. 
+ + :param block: A list of records to check for matches. Each record in + the list is itself a list of features. The first feature of the + record must be an "id" for the record. + :param feature_funcs: A dictionary mapping feature indices to functions + used to evaluate those features for a match. + :param col_to_idx: A dictionary mapping column names to the numeric index + in which they occur in order in the data. + :param match_eval: A function for determining whether a given set of + feature comparisons constitutes a match for linkage. + :return: A list of 2-tuples of the form (i,j), where i,j give the indices + in the block of data of records deemed to match. + """ + match_pairs: list[tuple] = [] + + # Dynamic programming table: order doesn't matter, so only need to + # check each combo of i,j once + for i, record_i in enumerate(block): + for j in range(i + 1, len(block)): + record_j = block[j] + feature_comps = [ + feature_funcs[feature_col]( + record_i, record_j, feature_col, col_to_idx, **kwargs + ) + for feature_col in feature_funcs + ] + + # If it's a match, store the result + is_match = match_eval(feature_comps, **kwargs) + if is_match: + match_pairs.append((i, j)) + + return match_pairs + + +def match_within_block_cluster_ratio( + block: list[list], + cluster_ratio: float, + feature_funcs: dict[str, typing.Callable], + col_to_idx: dict[str, int], + match_eval: typing.Callable, + **kwargs, +) -> list[set]: + """ + A matching function for statistically testing the impact of membership + ratio to the quality of clusters formed. This function behaves similarly + to `match_within_block`, except that rather than identifying all pairwise + candidates which are deemed matches, the function creates a list of + clusters of patients, where each cluster constitutes what would be a + single "representative" patient in the database. 
The formation of + clusters is determined by the parameter `cluster_ratio`, which defines + the proportion of other records in an existing cluster that a new + incoming record must match in order to join the cluster. + + :param block: A list of records to check for matches. Each record in + the list is itself a list of features. The first feature of the + record must be an "id" for the record. + :param cluster_ratio: A float giving the proportion of records in an + existing cluster that a new incoming record must match in order + to qualify for membership in the cluster. + :param feature_funcs: A dictionary mapping feature indices to functions + used to evaluate those features for a match. + :param col_to_idx: A dictionary mapping column names to the numeric index + in which they occur in order in the data. + :param match_eval: A function for determining whether a given set of + feature comparisons constitutes a match for linkage. + :return: A list of 2-tuples of the form (i,j), where i,j give the indices + in the block of data of records deemed to match. 
+ """ + clusters: list[set] = [] + for i in range(len(block)): + # Base case + if len(clusters) == 0: + clusters.append({i}) + continue + found_master_cluster = False + + # Iterate through clusters to find one that we match with + for cluster in clusters: + belongs = _eval_record_in_cluster( + block, + i, + cluster, + cluster_ratio, + feature_funcs, + col_to_idx, + match_eval, + **kwargs, + ) + if belongs: + found_master_cluster = True + cluster.add(i) + break + + # Create a new singleton if no other cluster qualified + if not found_master_cluster: + clusters.append({i}) + return clusters + + +def _eval_record_in_cluster( + block: list[list], + i: int, + cluster: set, + cluster_ratio: float, + feature_funcs: dict[str, typing.Callable], + col_to_idx: dict[str, int], + match_eval: typing.Callable, + **kwargs, +) -> bool: + """ + A helper function used to evaluate whether a given incoming record + satisfies the matching proportion threshold of an existing cluster, + and therefore would belong to the cluster. + """ + record_i = block[i] + num_matched = 0.0 + for j in cluster: + record_j = block[j] + feature_comps = [ + feature_funcs[feature_col]( + record_i, record_j, feature_col, col_to_idx, **kwargs + ) + for feature_col in feature_funcs + ] + + is_match = match_eval(feature_comps) + if is_match: + num_matched += 1.0 + + return (num_matched / len(cluster)) >= cluster_ratio + + +def _get_fuzzy_params(col: str, **kwargs) -> tuple[SIMILARITY_MEASURES, float]: + """ + Helper method to quickly determine the appropriate similarity measure + and fuzzy matching threshold to use for fuzzy-comparing a particular + field between two records. + + :param col: The string name of the column being used in a fuzzy + comparison. + :param kwargs: Optionally, a dictionary of keyword arguments containing + values for a similarity metric and appropriate fuzzy thresholds. + :return: A tuple containing the similarity metric to use and the + fuzzy comparison threshold to measure against. 
+ """ + similarity_measure: SIMILARITY_MEASURES = "JaroWinkler" + if "similarity_measure" in kwargs: + similarity_measure = kwargs["similarity_measure"] + + threshold: float = 0.7 + if "thresholds" in kwargs: + if col in kwargs["thresholds"]: + threshold = kwargs["thresholds"][col] + elif "threshold" in kwargs: + threshold = kwargs["threshold"] + + return (similarity_measure, threshold) diff --git a/src/recordlinker/linkage/utils.py b/src/recordlinker/linkage/utils.py index 2eac15e..524eed5 100644 --- a/src/recordlinker/linkage/utils.py +++ b/src/recordlinker/linkage/utils.py @@ -1,16 +1,13 @@ +import copy +import functools +import importlib import json import random +import typing from datetime import date from datetime import datetime -from functools import cache -from typing import Any -from typing import Callable -from typing import List -from typing import Literal -from typing import Union import fhirpathpy -import rapidfuzz # TODO: Not sure if we will need this or not # leaving in utils for now until it's determined that @@ -38,7 +35,7 @@ def datetime_to_str( - input_date: Union[str, date, datetime], include_time: bool = False + input_date: typing.Union[str, date, datetime], include_time: bool = False ) -> str: """ Convert a date or datetime object to a string; if a string is provided, @@ -54,7 +51,7 @@ def datetime_to_str( """ # Handle None or empty string if input_date is None or input_date == "": - return input_date + return "" # if input is str try to check that it follows the expected format if isinstance(input_date, str): @@ -84,52 +81,13 @@ def datetime_to_str( # Originally from phdi/harmonization/utils.py -def compare_strings( - string1: str, - string2: str, - similarity_measure: Literal[ - "JaroWinkler", "Levenshtein", "DamerauLevenshtein" - ] = "JaroWinkler", -) -> float: - """ - Returns the normalized similarity measure between string1 and string2, as - determined by the similarlity measure. 
The higher the normalized similarity measure - (up to 1.0), the more similar string1 and string2 are. A normalized similarity - measure of 0.0 means string1 and string 2 are not at all similar. This function - expects basic text cleaning (e.g. removal of numeric characters, trimming of spaces, - etc.) to already have been performed on the input strings. - - :param string1: First string for comparison. - :param string2: Second string for comparison. - :param similarity_measure: The method used to measure the similarity between two - strings, defaults to "JaroWinkler". - - JaroWinkler: a ratio of matching characters and transpositions needed to - transform string1 into string2. - - Levenshtein: the number of edits (excluding transpositions) needed to transform - string1 into string2. - - DamerauLevenshtein: the number of edits (including transpositions) needed to - transform string1 into string2. - :return: The normalized similarity between string1 and string2, with 0 representing - no similarity between string1 and string2, and 1 meaning string1 and string2 are - dentical words. - """ - if similarity_measure == "JaroWinkler": - return rapidfuzz.distance.JaroWinkler.normalized_similarity(string1, string2) - elif similarity_measure == "Levenshtein": - return rapidfuzz.distance.Levenshtein.normalized_similarity(string1, string2) - elif similarity_measure == "DamerauLevenshtein": - return rapidfuzz.distance.DamerauLevenshtein.normalized_similarity( - string1, string2 - ) - - -selection_criteria_types = Literal["first", "last", "random", "all"] +selection_criteria_types = typing.Literal["first", "last", "random", "all"] def apply_selection_criteria( - value: List[Any], + value: list[typing.Any], selection_criteria: selection_criteria_types, -) -> str | List: +) -> str | list: """ Returns value(s), according to the selection criteria, from a given list of values parsed from a FHIR resource. 
A single string value is returned - if the selected @@ -168,8 +126,8 @@ def apply_selection_criteria( def extract_value_with_resource_path( resource: dict, path: str, - selection_criteria: Literal["first", "last", "random", "all"] = "first", -) -> Union[Any, None]: + selection_criteria: selection_criteria_types = "first", +) -> typing.Union[typing.Any, None]: """ Yields a single value from a resource based on a provided `fhir_path`. If the path doesn't map to an extant value in the first, returns @@ -190,8 +148,8 @@ def extract_value_with_resource_path( return value -@cache -def get_fhirpathpy_parser(fhirpath_expression: str) -> Callable: +@functools.cache +def get_fhirpathpy_parser(fhirpath_expression: str) -> typing.Callable: """ Accepts a FHIRPath expression, and returns a callable function which returns the evaluated value at fhirpath_expression for @@ -201,3 +159,45 @@ def get_fhirpathpy_parser(fhirpath_expression: str) -> Callable: will return value at `fhirpath_expression`. """ return fhirpathpy.compile(fhirpath_expression) + + +def bind_functions(data: dict) -> dict: + """ + Binds the functions in the data to the functions in the module. + """ + def _eval_non_list(data): + if isinstance(data, dict): + return bind_functions(data) + elif isinstance(data, str) and data.startswith("func:"): + return str_to_callable(data) + return data + + bound = copy.copy(data) + for key, value in bound.items(): + if isinstance(value, list): + bound[key] = [_eval_non_list(item) for item in value] + else: + bound[key] = _eval_non_list(value) + return bound + + +def str_to_callable(val: str) -> typing.Callable: + """ + Converts a string representation of a function to the function itself. 
+ """ + # Remove the "func:" prefix + if val.startswith("func:"): + val = val[5:] + # Split the string into module path and function name + module_path, func_name = val.rsplit(".", 1) + # Import the module + module = importlib.import_module(module_path) + # Get the function from the module + return getattr(module, func_name) + + +def func_to_str(func: typing.Callable) -> str: + """ + Converts a function to a string representation of the function. + """ + return f"func:{func.__module__}.{func.__name__}" diff --git a/tests/unit/test_linkage.py b/tests/unit/test_linkage.py index 0ab4bf5..d482f9a 100644 --- a/tests/unit/test_linkage.py +++ b/tests/unit/test_linkage.py @@ -3,12 +3,14 @@ import os import pathlib import uuid -from datetime import date -from datetime import datetime from json.decoder import JSONDecodeError import pytest +from sqlalchemy import select +from sqlalchemy import text + from recordlinker.config import settings +from recordlinker.linkage import matchers from recordlinker.linkage.algorithms import DIBBS_BASIC from recordlinker.linkage.algorithms import DIBBS_ENHANCED from recordlinker.linkage.dal import DataAccessLayer @@ -17,28 +19,16 @@ from recordlinker.linkage.link import _condense_extract_address_from_resource from recordlinker.linkage.link import _convert_given_name_to_first_name from recordlinker.linkage.link import _flatten_patient_resource -from recordlinker.linkage.link import _get_fuzzy_params -from recordlinker.linkage.link import _match_within_block_cluster_ratio from recordlinker.linkage.link import add_person_resource -from recordlinker.linkage.link import eval_log_odds_cutoff -from recordlinker.linkage.link import eval_perfect_match from recordlinker.linkage.link import extract_blocking_values_from_record -from recordlinker.linkage.link import feature_match_exact -from recordlinker.linkage.link import feature_match_four_char -from recordlinker.linkage.link import feature_match_fuzzy_string -from recordlinker.linkage.link import 
feature_match_log_odds_exact -from recordlinker.linkage.link import feature_match_log_odds_fuzzy_compare from recordlinker.linkage.link import generate_hash_str from recordlinker.linkage.link import link_record_against_mpi from recordlinker.linkage.link import load_json_probs -from recordlinker.linkage.link import match_within_block from recordlinker.linkage.link import read_linkage_config from recordlinker.linkage.link import score_linkage_vs_truth from recordlinker.linkage.link import write_linkage_config from recordlinker.linkage.mpi import DIBBsMPIConnectorClient from recordlinker.utils import _clean_up -from sqlalchemy import select -from sqlalchemy import text def _init_db() -> DataAccessLayer: @@ -152,169 +142,6 @@ def test_generate_hash(): assert hash_2 == "102818c623290c24069beb721c6eb465d281b3b67ecfb6aef924d14affa117b9" -def test_feature_match_exact(): - record_i = [1, 0, -1, "blah", "", True] - record_j = [1, 0, -1, "blah", "", True] - record_k = [2, 10, -10, "no match", "null", False] - - cols = {"col_1": 0, "col_2": 1, "col_3": 2, "col_4": 3, "col_5": 4, "col_6": 5} - - # Simultaneously test matches and non-matches of different data types - for c in cols: - assert feature_match_exact(record_i, record_j, c, cols) - assert not feature_match_exact(record_i, record_k, c, cols) - - # Special case for matching None--None == None is vacuous - assert feature_match_exact([None], [None], "col_7", {"col_7": 0}) - - -def test_get_fuzzy_params(): - kwargs = { - "similarity_measure": "Levenshtein", - "thresholds": {"city": 0.95, "address": 0.98}, - } - - assert _get_fuzzy_params("city", **kwargs) == ("Levenshtein", 0.95) - assert _get_fuzzy_params("address", **kwargs) == ("Levenshtein", 0.98) - assert _get_fuzzy_params("first_name", **kwargs) == ("Levenshtein", 0.7) - - del kwargs["similarity_measure"] - - assert _get_fuzzy_params("last_name", **kwargs) == ("JaroWinkler", 0.7) - - -def test_feature_match_fuzzy_string(): - record_i = ["string1", "John", "John", 
"1985-12-12", None] - record_j = ["string2", "Jhon", "Jon", "1985-12-12", None] - - cols = {"col_1": 0, "col_2": 1, "col_3": 2, "col_4": 3} - - for c in cols: - assert feature_match_fuzzy_string( - record_i, - record_j, - c, - cols, - similarity_measure="JaroWinkler", - threshold=0.7, - ) - assert not feature_match_fuzzy_string( - ["no match"], - ["dont match me bro"], - "col_5", - {"col_5": 0}, - similarity_measure="JaroWinkler", - threshold=0.7, - ) - - -def test_eval_perfect_match(): - assert eval_perfect_match([1, 1, 1]) - assert not eval_perfect_match([1, 1, 0]) - assert not eval_perfect_match([1, 0, 0]) - assert not eval_perfect_match([0, 0, 0]) - - -def test_match_within_block_cluster_ratio(): - data = [ - [1, "John", "Shepard", "11-7-2153", "90909"], - [5, "Jhon", "Sheperd", "11-7-2153", "90909"], - [11, "Jon", "Shepherd", "11-7-2153", "90909"], - [12, "Johnathan", "Shepard", "11-7-2153", "90909"], - [13, "Nathan", "Shepard", "11-7-2153", "90909"], - [14, "Jane", "Smith", "01-10-1986", "12345"], - [18, "Daphne", "Walker", "12-12-1992", "23456"], - [23, "Alejandro", "Villanueve", "1-1-1980", "15935"], - [24, "Alejandro", "Villanueva", "1-1-1980", "15935"], - [27, "Philip", "", "2-2-1990", "64873"], - [31, "Alejandr", "Villanueve", "1-1-1980", "15935"], - [32, "Aelxdrano", "Villanueve", "1-1-1980", "15935"], - ] - - eval_rule = eval_perfect_match - funcs = { - "first_name": feature_match_fuzzy_string, - "last_name": feature_match_fuzzy_string, - "birthdate": feature_match_exact, - "zip": feature_match_exact, - } - col_to_idx = {"first_name": 1, "last_name": 2, "birthdate": 3, "zip": 4} - - # Do a test run requiring total membership match - matches = _match_within_block_cluster_ratio( - data, 1.0, funcs, col_to_idx, eval_rule, threshold=0.8 - ) - assert matches == [{0, 1, 2}, {3}, {4}, {5}, {6}, {7, 8, 10}, {9}, {11}] - - # Now do a test showing different cluster groupings - matches = _match_within_block_cluster_ratio( - data, 0.6, funcs, col_to_idx, 
eval_rule, threshold=0.8 - ) - assert matches == [{0, 1, 2, 3}, {4}, {5}, {6}, {7, 8, 10, 11}, {9}] - - -def test_match_within_block(): - # Data will be of the form: - # patient_id, first_name, last_name, DOB, zip code - data = [ - [1, "John", "Shepard", "11-7-2153", "90909"], - [5, "Jhon", "Sheperd", "11-7-2153", "90909"], - [11, "Jon", "Shepherd", "11-7-2153", "90909"], - [14, "Jane", "Smith", "01-10-1986", "12345"], - [18, "Daphne", "Walker", "12-12-1992", "23456"], - [23, "Alejandro", "Villanueve", "1-1-1980", "15935"], - [24, "Alejandro", "Villanueva", "1-1-1980", "15935"], - [27, "Philip", "", "2-2-1990", "64873"], - [31, "Alejandr", "Villanueve", "1-1-1980", "15935"], - ] - eval_rule = eval_perfect_match - - # First, require exact matches on everything to match - # Expect 0 pairs - funcs = { - "first_name": feature_match_exact, - "last_name": feature_match_exact, - "birthdate": feature_match_exact, - "zip": feature_match_exact, - } - col_to_idx = {"first_name": 1, "last_name": 2, "birthdate": 3, "zip": 4} - match_pairs = match_within_block(data, funcs, col_to_idx, eval_rule) - assert len(match_pairs) == 0 - - # Now, require exact on DOB and zip, but allow fuzzy on first and last - # Expect 6 matches - funcs["first_name"] = feature_match_fuzzy_string - funcs["last_name"] = feature_match_fuzzy_string - match_pairs = match_within_block(data, funcs, col_to_idx, eval_rule) - assert match_pairs == [(0, 1), (0, 2), (1, 2), (5, 6), (5, 8), (6, 8)] - - # As above, but let's be explicit about string comparison and threshold - # Expect three matches, but none with the "Johns" - # Note the difference in returned results by changing distance function - match_pairs = match_within_block( - data, - funcs, - col_to_idx, - eval_rule, - similarity_measure="Levenshtein", - threshold=0.8, - ) - assert match_pairs == [(5, 6), (5, 8), (6, 8)] - - -def test_feature_match_four_char(): - record_i = ["Johnathan", "Shepard"] - record_j = ["John", "Sheperd"] - record_k = ["Jhon", 
"Sehpard"] - - cols = {"first": 0, "last": 1} - - # Simultaneously test matches and non-matches of different data types - for c in cols: - assert feature_match_four_char(record_i, record_j, c, cols) - assert not feature_match_four_char(record_i, record_k, c, cols) - - def test_score_linkage_vs_truth(): num_records = 12 matches = { @@ -366,87 +193,6 @@ def test_load_json_probs_errors(): os.remove("not_valid_json.json") -def test_eval_log_odds_cutoff(): - with pytest.raises(KeyError) as e: - eval_log_odds_cutoff([]) - assert "Cutoff threshold for true matches must be passed" in str(e.value) - - assert not eval_log_odds_cutoff([], true_match_threshold=10.0) - assert not eval_log_odds_cutoff([1.0, 0.0, 6.0, 2.7], true_match_threshold=10.0) - assert eval_log_odds_cutoff([4.3, 6.1, 2.5], true_match_threshold=10.0) - - -def test_feature_match_log_odds_exact(): - with pytest.raises(KeyError) as e: - feature_match_log_odds_exact([], [], "c", {}) - assert "Mapping of columns to m/u log-odds must be provided" in str(e.value) - - ri = ["John", "Shepard", "11-07-1980", "1234 Silversun Strip"] - rj = ["John", 6.0, None, "2345 Goldmoon Ave."] - col_to_idx = {"first": 0, "last": 1, "birthdate": 2, "address": 3} - log_odds = {"first": 4.0, "last": 6.5, "birthdate": 9.8, "address": 3.7} - - assert ( - feature_match_log_odds_exact(ri, rj, "first", col_to_idx, log_odds=log_odds) - == 4.0 - ) - - for c in col_to_idx: - if c != "first": - assert ( - feature_match_log_odds_exact(ri, rj, c, col_to_idx, log_odds=log_odds) - == 0.0 - ) - - -def test_feature_match_log_odds_fuzzy(): - with pytest.raises(KeyError) as e: - feature_match_log_odds_fuzzy_compare([], [], "c", {}) - assert "Mapping of columns to m/u log-odds must be provided" in str(e.value) - - ri = ["John", "Shepard", date(1980, 11, 7), "1234 Silversun Strip"] - rj = ["John", "Sheperd", datetime(1970, 6, 7), "asdfghjeki"] - col_to_idx = {"first": 0, "last": 1, "birthdate": 2, "address": 3} - log_odds = {"first": 4.0, "last": 6.5, 
"birthdate": 9.8, "address": 3.7} - - assert ( - feature_match_log_odds_fuzzy_compare( - ri, rj, "first", col_to_idx, log_odds=log_odds - ) - == 4.0 - ) - - assert ( - round( - feature_match_log_odds_fuzzy_compare( - ri, rj, "last", col_to_idx, log_odds=log_odds - ), - 3, - ) - == 6.129 - ) - - assert ( - round( - feature_match_log_odds_fuzzy_compare( - ri, rj, "birthdate", col_to_idx, log_odds=log_odds - ), - 3, - ) - == 7.859 - ) - - assert ( - round( - feature_match_log_odds_fuzzy_compare( - ri, rj, "address", col_to_idx, log_odds=log_odds - ), - 3, - ) - == 0.0 - ) - - def test_algo_read(): dibbs_basic_algo = read_linkage_config( pathlib.Path(__file__).parent.parent.parent @@ -457,15 +203,15 @@ def test_algo_read(): assert dibbs_basic_algo == [ { "funcs": { - "first_name": "feature_match_fuzzy_string", - "last_name": "feature_match_exact", + "first_name": "func:recordlinker.linkage.matchers.feature_match_fuzzy_string", + "last_name": "func:recordlinker.linkage.matchers.feature_match_exact", }, "blocks": [ {"value": "birthdate"}, {"value": "mrn", "transformation": "last4"}, {"value": "sex"}, ], - "matching_rule": "eval_perfect_match", + "matching_rule": "func:recordlinker.linkage.matchers.eval_perfect_match", "cluster_ratio": 0.9, "kwargs": { "thresholds": { @@ -480,8 +226,8 @@ def test_algo_read(): }, { "funcs": { - "address": "feature_match_fuzzy_string", - "birthdate": "feature_match_exact", + "address": "func:recordlinker.linkage.matchers.feature_match_fuzzy_string", + "birthdate": "func:recordlinker.linkage.matchers.feature_match_exact", }, "blocks": [ {"value": "zip"}, @@ -489,7 +235,7 @@ def test_algo_read(): {"value": "last_name", "transformation": "first4"}, {"value": "sex"}, ], - "matching_rule": "eval_perfect_match", + "matching_rule": "func:recordlinker.linkage.matchers.eval_perfect_match", "cluster_ratio": 0.9, "kwargs": { "thresholds": { @@ -513,15 +259,15 @@ def test_algo_read(): assert dibbs_enhanced_algo == [ { "funcs": { - "first_name": 
"feature_match_log_odds_fuzzy_compare", - "last_name": "feature_match_log_odds_fuzzy_compare", + "first_name": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", + "last_name": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", }, "blocks": [ {"value": "birthdate"}, {"value": "mrn", "transformation": "last4"}, {"value": "sex"}, ], - "matching_rule": "eval_log_odds_cutoff", + "matching_rule": "func:recordlinker.linkage.matchers.eval_log_odds_cutoff", "cluster_ratio": 0.9, "kwargs": { "similarity_measure": "JaroWinkler", @@ -549,8 +295,8 @@ def test_algo_read(): }, { "funcs": { - "address": "feature_match_log_odds_fuzzy_compare", - "birthdate": "feature_match_log_odds_fuzzy_compare", + "address": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", + "birthdate": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", }, "blocks": [ {"value": "zip"}, @@ -558,7 +304,7 @@ def test_algo_read(): {"value": "last_name", "transformation": "first4"}, {"value": "sex"}, ], - "matching_rule": "eval_log_odds_cutoff", + "matching_rule": "func:recordlinker.linkage.matchers.eval_log_odds_cutoff", "cluster_ratio": 0.9, "kwargs": { "similarity_measure": "JaroWinkler", @@ -603,20 +349,20 @@ def test_algo_write(): sample_algo = [ { "funcs": { - "first_name": feature_match_fuzzy_string, - "last_name": feature_match_exact, + "first_name": matchers.feature_match_fuzzy_string, + "last_name": matchers.feature_match_exact, }, "blocks": ["MRN4", "ADDRESS4"], - "matching_rule": eval_perfect_match, + "matching_rule": matchers.eval_perfect_match, }, { "funcs": { - "last_name": feature_match_four_char, - "sex": feature_match_log_odds_exact, - "address": feature_match_log_odds_fuzzy_compare, + "last_name": matchers.feature_match_four_char, + "sex": matchers.feature_match_log_odds_exact, + "address": matchers.feature_match_log_odds_fuzzy_compare, }, "blocks": ["ZIP", "BIRTH_YEAR"], - "matching_rule": 
eval_log_odds_cutoff, + "matching_rule": matchers.eval_log_odds_cutoff, "cluster_ratio": 0.9, "kwargs": {"similarity_measure": "Levenshtein", "threshold": 0.85}, }, @@ -630,20 +376,20 @@ def test_algo_write(): assert loaded_algo == [ { "funcs": { - "first_name": "feature_match_fuzzy_string", - "last_name": "feature_match_exact", + "first_name": "func:recordlinker.linkage.matchers.feature_match_fuzzy_string", + "last_name": "func:recordlinker.linkage.matchers.feature_match_exact", }, "blocks": ["MRN4", "ADDRESS4"], - "matching_rule": "eval_perfect_match", + "matching_rule": "func:recordlinker.linkage.matchers.eval_perfect_match", }, { "funcs": { - "last_name": "feature_match_four_char", - "sex": "feature_match_log_odds_exact", - "address": "feature_match_log_odds_fuzzy_compare", + "last_name": "func:recordlinker.linkage.matchers.feature_match_four_char", + "sex": "func:recordlinker.linkage.matchers.feature_match_log_odds_exact", + "address": "func:recordlinker.linkage.matchers.feature_match_log_odds_fuzzy_compare", }, "blocks": ["ZIP", "BIRTH_YEAR"], - "matching_rule": "eval_log_odds_cutoff", + "matching_rule": "func:recordlinker.linkage.matchers.eval_log_odds_cutoff", "cluster_ratio": 0.9, "kwargs": {"similarity_measure": "Levenshtein", "threshold": 0.85}, }, @@ -899,7 +645,7 @@ def test_add_person_resource(): def test_compare_address_elements(): feature_funcs = { - "address": feature_match_four_char, + "address": matchers.feature_match_four_char, } col_to_idx = {"address": 2} record = [ @@ -948,7 +694,7 @@ def test_compare_address_elements(): def test_compare_name_elements(): - feature_funcs = {"first": feature_match_fuzzy_string} + feature_funcs = {"first": matchers.feature_match_fuzzy_string} col_to_idx = {"first": 0} record = [ "123", diff --git a/tests/unit/test_linkage_utils.py b/tests/unit/test_linkage_utils.py index 0d53cfa..44f3e5e 100644 --- a/tests/unit/test_linkage_utils.py +++ b/tests/unit/test_linkage_utils.py @@ -2,7 +2,9 @@ from datetime import 
datetime import pytest -from recordlinker.linkage.link import datetime_to_str + +from recordlinker.linkage import matchers +from recordlinker.linkage import utils @pytest.mark.parametrize( @@ -14,7 +16,7 @@ ], ) def test_valid_datetime_to_str(input_value, expected_output): - assert datetime_to_str(input_value) == expected_output + assert utils.datetime_to_str(input_value) == expected_output @pytest.mark.parametrize( @@ -26,18 +28,79 @@ def test_valid_datetime_to_str(input_value, expected_output): ], ) def test_valid_datetime_to_str_with_time(input_value, expected_output): - assert datetime_to_str(input_value, include_time=True) == expected_output + assert utils.datetime_to_str(input_value, include_time=True) == expected_output @pytest.mark.parametrize( "input_value, expected_output", [ ("", ""), - (None, None), + (None, ""), (20231010, "20231010"), (["2023-10-10"], "['2023-10-10']"), ({"date": "2023-10-10"}, "{'date': '2023-10-10'}"), ], ) def test_bad_input_datetime_to_str(input_value, expected_output): - assert datetime_to_str(input_value) == expected_output + assert utils.datetime_to_str(input_value) == expected_output + + +def test_bind_functions(): + funcs = { + "first_name": "func:recordlinker.linkage.matchers.feature_match_fuzzy_string", + "last_name": "func:recordlinker.linkage.matchers.feature_match_exact", + } + assert utils.bind_functions(funcs) == { + "first_name": matchers.feature_match_fuzzy_string, + "last_name": matchers.feature_match_exact, + } + + funcs = { + "blocks": [ + {"value": "birthdate"}, + {"value": "func:recordlinker.linkage.matchers.feature_match_exact"}, + ] + } + assert utils.bind_functions(funcs) == { + "blocks": [ + {"value": "birthdate"}, + {"value": matchers.feature_match_exact}, + ] + } + + funcs = { + "nested": { + "first_name": "func:recordlinker.linkage.matchers.feature_match_fuzzy_string", + "last_name": "func:recordlinker.linkage.matchers.feature_match_exact", + } + } + assert utils.bind_functions(funcs) == { + "nested": { + 
"first_name": matchers.feature_match_fuzzy_string, + "last_name": matchers.feature_match_exact, + } + } + + +def test_str_to_callable(): + val = "func:recordlinker.linkage.matchers.feature_match_exact" + assert utils.str_to_callable(val) == matchers.feature_match_exact + val = "recordlinker.linkage.matchers.feature_match_exact" + assert utils.str_to_callable(val) == matchers.feature_match_exact + val = "recordlinker.unknown_module.unknown_function" + with pytest.raises(ImportError): + utils.str_to_callable(val) + val = "recordlinker.linkage.matchers.unknown_function" + with pytest.raises(AttributeError): + utils.str_to_callable(val) + + +def test_func_to_str(): + assert ( + utils.func_to_str(matchers.feature_match_exact) + == "func:recordlinker.linkage.matchers.feature_match_exact" + ) + assert ( + utils.func_to_str(matchers.feature_match_fuzzy_string) + == "func:recordlinker.linkage.matchers.feature_match_fuzzy_string" + ) diff --git a/tests/unit/test_matchers.py b/tests/unit/test_matchers.py new file mode 100644 index 0000000..98de91f --- /dev/null +++ b/tests/unit/test_matchers.py @@ -0,0 +1,255 @@ +""" +unit.test_matchers +~~~~~~~~~~~~~~~~~~ + +This module contains unit tests for the :mod:`~recordlinker.linkage.matchers` module. 
+""" +import datetime + +import pytest + +from recordlinker.linkage import matchers + + +def test_get_fuzzy_params(): + kwargs = { + "similarity_measure": "Levenshtein", + "thresholds": {"city": 0.95, "address": 0.98}, + } + + assert matchers._get_fuzzy_params("city", **kwargs) == ("Levenshtein", 0.95) + assert matchers._get_fuzzy_params("address", **kwargs) == ("Levenshtein", 0.98) + assert matchers._get_fuzzy_params("first_name", **kwargs) == ("Levenshtein", 0.7) + + del kwargs["similarity_measure"] + + assert matchers._get_fuzzy_params("last_name", **kwargs) == ("JaroWinkler", 0.7) + + +def test_feature_match_fuzzy_string(): + record_i = ["string1", "John", "John", "1985-12-12", None] + record_j = ["string2", "Jhon", "Jon", "1985-12-12", None] + + cols = {"col_1": 0, "col_2": 1, "col_3": 2, "col_4": 3} + + for c in cols: + assert matchers.feature_match_fuzzy_string( + record_i, + record_j, + c, + cols, + similarity_measure="JaroWinkler", + threshold=0.7, + ) + assert not matchers.feature_match_fuzzy_string( + ["no match"], + ["dont match me bro"], + "col_5", + {"col_5": 0}, + similarity_measure="JaroWinkler", + threshold=0.7, + ) + + +def test_eval_perfect_match(): + assert matchers.eval_perfect_match([1, 1, 1]) + assert not matchers.eval_perfect_match([1, 1, 0]) + assert not matchers.eval_perfect_match([1, 0, 0]) + assert not matchers.eval_perfect_match([0, 0, 0]) + + +def test_match_within_block_cluster_ratio(): + data = [ + [1, "John", "Shepard", "11-7-2153", "90909"], + [5, "Jhon", "Sheperd", "11-7-2153", "90909"], + [11, "Jon", "Shepherd", "11-7-2153", "90909"], + [12, "Johnathan", "Shepard", "11-7-2153", "90909"], + [13, "Nathan", "Shepard", "11-7-2153", "90909"], + [14, "Jane", "Smith", "01-10-1986", "12345"], + [18, "Daphne", "Walker", "12-12-1992", "23456"], + [23, "Alejandro", "Villanueve", "1-1-1980", "15935"], + [24, "Alejandro", "Villanueva", "1-1-1980", "15935"], + [27, "Philip", "", "2-2-1990", "64873"], + [31, "Alejandr", "Villanueve", 
"1-1-1980", "15935"], + [32, "Aelxdrano", "Villanueve", "1-1-1980", "15935"], + ] + + eval_rule = matchers.eval_perfect_match + funcs = { + "first_name": matchers.feature_match_fuzzy_string, + "last_name": matchers.feature_match_fuzzy_string, + "birthdate": matchers.feature_match_exact, + "zip": matchers.feature_match_exact, + } + col_to_idx = {"first_name": 1, "last_name": 2, "birthdate": 3, "zip": 4} + + # Do a test run requiring total membership match + matches = matchers.match_within_block_cluster_ratio( + data, 1.0, funcs, col_to_idx, eval_rule, threshold=0.8 + ) + assert matches == [{0, 1, 2}, {3}, {4}, {5}, {6}, {7, 8, 10}, {9}, {11}] + + # Now do a test showing different cluster groupings + matches = matchers.match_within_block_cluster_ratio( + data, 0.6, funcs, col_to_idx, eval_rule, threshold=0.8 + ) + assert matches == [{0, 1, 2, 3}, {4}, {5}, {6}, {7, 8, 10, 11}, {9}] + + +def test_match_within_block(): + # Data will be of the form: + # patient_id, first_name, last_name, DOB, zip code + data = [ + [1, "John", "Shepard", "11-7-2153", "90909"], + [5, "Jhon", "Sheperd", "11-7-2153", "90909"], + [11, "Jon", "Shepherd", "11-7-2153", "90909"], + [14, "Jane", "Smith", "01-10-1986", "12345"], + [18, "Daphne", "Walker", "12-12-1992", "23456"], + [23, "Alejandro", "Villanueve", "1-1-1980", "15935"], + [24, "Alejandro", "Villanueva", "1-1-1980", "15935"], + [27, "Philip", "", "2-2-1990", "64873"], + [31, "Alejandr", "Villanueve", "1-1-1980", "15935"], + ] + eval_rule = matchers.eval_perfect_match + + # First, require exact matches on everything to match + # Expect 0 pairs + funcs = { + "first_name": matchers.feature_match_exact, + "last_name": matchers.feature_match_exact, + "birthdate": matchers.feature_match_exact, + "zip": matchers.feature_match_exact, + } + col_to_idx = {"first_name": 1, "last_name": 2, "birthdate": 3, "zip": 4} + match_pairs = matchers.match_within_block(data, funcs, col_to_idx, eval_rule) + assert len(match_pairs) == 0 + + # Now, require 
exact on DOB and zip, but allow fuzzy on first and last + # Expect 6 matches + funcs["first_name"] = matchers.feature_match_fuzzy_string + funcs["last_name"] = matchers.feature_match_fuzzy_string + match_pairs = matchers.match_within_block(data, funcs, col_to_idx, eval_rule) + assert match_pairs == [(0, 1), (0, 2), (1, 2), (5, 6), (5, 8), (6, 8)] + + # As above, but let's be explicit about string comparison and threshold + # Expect three matches, but none with the "Johns" + # Note the difference in returned results by changing distance function + match_pairs = matchers.match_within_block( + data, + funcs, + col_to_idx, + eval_rule, + similarity_measure="Levenshtein", + threshold=0.8, + ) + assert match_pairs == [(5, 6), (5, 8), (6, 8)] + + +def test_feature_match_four_char(): + record_i = ["Johnathan", "Shepard"] + record_j = ["John", "Sheperd"] + record_k = ["Jhon", "Sehpard"] + + cols = {"first": 0, "last": 1} + + # Simultaneously test matches and non-matches of different data types + for c in cols: + assert matchers.feature_match_four_char(record_i, record_j, c, cols) + assert not matchers.feature_match_four_char(record_i, record_k, c, cols) + + +def test_feature_match_exact(): + record_i = [1, 0, -1, "blah", "", True] + record_j = [1, 0, -1, "blah", "", True] + record_k = [2, 10, -10, "no match", "null", False] + + cols = {"col_1": 0, "col_2": 1, "col_3": 2, "col_4": 3, "col_5": 4, "col_6": 5} + + # Simultaneously test matches and non-matches of different data types + for c in cols: + assert matchers.feature_match_exact(record_i, record_j, c, cols) + assert not matchers.feature_match_exact(record_i, record_k, c, cols) + + # Special case for matching None--None == None is vacuous + assert matchers.feature_match_exact([None], [None], "col_7", {"col_7": 0}) + + +def test_eval_log_odds_cutoff(): + with pytest.raises(KeyError) as e: + matchers.eval_log_odds_cutoff([]) + assert "Cutoff threshold for true matches must be passed" in str(e.value) + + assert not 
matchers.eval_log_odds_cutoff([], true_match_threshold=10.0) + assert not matchers.eval_log_odds_cutoff([1.0, 0.0, 6.0, 2.7], true_match_threshold=10.0) + assert matchers.eval_log_odds_cutoff([4.3, 6.1, 2.5], true_match_threshold=10.0) + + +def test_feature_match_log_odds_exact(): + with pytest.raises(KeyError) as e: + matchers.feature_match_log_odds_exact([], [], "c", {}) + assert "Mapping of columns to m/u log-odds must be provided" in str(e.value) + + ri = ["John", "Shepard", "11-07-1980", "1234 Silversun Strip"] + rj = ["John", 6.0, None, "2345 Goldmoon Ave."] + col_to_idx = {"first": 0, "last": 1, "birthdate": 2, "address": 3} + log_odds = {"first": 4.0, "last": 6.5, "birthdate": 9.8, "address": 3.7} + + assert ( + matchers.feature_match_log_odds_exact(ri, rj, "first", col_to_idx, log_odds=log_odds) + == 4.0 + ) + + for c in col_to_idx: + if c != "first": + assert ( + matchers.feature_match_log_odds_exact(ri, rj, c, col_to_idx, log_odds=log_odds) + == 0.0 + ) + + +def test_feature_match_log_odds_fuzzy(): + with pytest.raises(KeyError) as e: + matchers.feature_match_log_odds_fuzzy_compare([], [], "c", {}) + assert "Mapping of columns to m/u log-odds must be provided" in str(e.value) + + ri = ["John", "Shepard", datetime.date(1980, 11, 7), "1234 Silversun Strip"] + rj = ["John", "Sheperd", datetime.datetime(1970, 6, 7), "asdfghjeki"] + col_to_idx = {"first": 0, "last": 1, "birthdate": 2, "address": 3} + log_odds = {"first": 4.0, "last": 6.5, "birthdate": 9.8, "address": 3.7} + + assert ( + matchers.feature_match_log_odds_fuzzy_compare( + ri, rj, "first", col_to_idx, log_odds=log_odds + ) + == 4.0 + ) + + assert ( + round( + matchers.feature_match_log_odds_fuzzy_compare( + ri, rj, "last", col_to_idx, log_odds=log_odds + ), + 3, + ) + == 6.129 + ) + + assert ( + round( + matchers.feature_match_log_odds_fuzzy_compare( + ri, rj, "birthdate", col_to_idx, log_odds=log_odds + ), + 3, + ) + == 7.859 + ) + + assert ( + round( + 
matchers.feature_match_log_odds_fuzzy_compare( + ri, rj, "address", col_to_idx, log_odds=log_odds + ), + 3, + ) + == 0.0 + )