Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Athena: Running predict link_only when one input is empty should return an empty result set, or raise an error if an empty input is not valid #2496

Open
2 tasks done
alanakilleen opened this issue Nov 6, 2024 · 0 comments
Labels
bug Something isn't working

Comments

@alanakilleen
Copy link

What happens?

Up to version 3.9.9, running `predict` with `link_type` set to `link_only` when one input had no records produced an empty result set. From version 3.9.10 onwards, it instead appears that the non-empty input is linked to itself.

An empty input producing an empty result set seems like the correct behavior. If an empty input dataset is not a valid input, an error should be raised instead.

To Reproduce

Splink Version 3 Repro

Note - Update the region if needed and replace <bucket name> with the bucket you want to use.

# Print the Splink version
import splink
print(splink.__version__)

# Set some variables
database_name = "athena_timeout_testing"
s3_bucket = "<bucket name>"
boto3_region= "us-east-2"

# Create test DataFrames
import pandas as pd
data = {'id': [0, 1, 2, 3],
        'First Name': ['Niall', 'Harry', 'Louis', 'Zayne'],
        'Last Name': ['Horan', 'Styles', 'Tomlinsin', 'Malik']}
df = pd.DataFrame(data)
df= df.astype("string")
df_empty = df.head(0)

# Create a test database in Athena
import boto3
import awswrangler as wr
boto3_session = boto3.Session(region_name=boto3_region)
wr.catalog.create_database(database_name, boto3_session=boto3_session, exist_ok=True)

# Write the Dataframes to Athena
wr.s3.to_parquet(
    boto3_session=boto3_session,
    df=df,
    path=f"s3://{s3_bucket}/test-table/",
    dataset=True,
    mode="overwrite",
    database=database_name,
    table='sample_data'
)
wr.s3.to_parquet(
    boto3_session=boto3_session,
    df=df_empty,
    path=f"s3://{s3_bucket}/test-table-empty/",
    dataset=True,
    mode="overwrite",
    database=database_name,
    table='sample_empty'
)

# Create some example settings
import splink.athena.athena_comparison_library as cl
settings = {
    "link_type": "link_only",
    "blocking_rules_to_generate_predictions": [
        "l.first_name = r.first_name and l.last_name = r.last_name",
    ],
    "comparisons": [
        cl.levenshtein_at_thresholds("first_name", [1,2], term_frequency_adjustments=True),
    ],
    "unique_id_column_name": "id",
}

# Create an AthenaLinker and run predict (link_only) for the two Dataframes
from splink.athena.athena_linker import AthenaLinker
linker_compare_to_empty = AthenaLinker(
    input_table_or_tables=['sample_data', 'sample_empty'],
    settings_dict=settings,
    boto3_session=boto3_session,
    output_database=database_name,
    output_bucket=s3_bucket,
)
sample1_vs_empty_results = linker_compare_to_empty.predict()

# Get the output
sample1_vs_empty_results.as_record_dict()

Expected Output

[]

Actual Output

[{'match_weight': -3.2875681028314045,
  'match_probability': 0.09289667059784089,
  'source_dataset_l': '__splink__input_table_1',
  'source_dataset_r': '__splink__input_table_1',
  'id_l': '0',
  'id_r': '0',
  'first_name_l': 'Niall',
  'first_name_r': 'Niall',
  'gamma_first_name': 3,
  'last_name_l': 'Horan',
  'last_name_r': 'Horan'},
 {'match_weight': -3.2875681028314045,
  'match_probability': 0.09289667059784089,
  'source_dataset_l': '__splink__input_table_1',
  'source_dataset_r': '__splink__input_table_1',
  'id_l': '3',
  'id_r': '3',
  'first_name_l': 'Zayne',
  'first_name_r': 'Zayne',
  'gamma_first_name': 3,
  'last_name_l': 'Malik',
  'last_name_r': 'Malik'},
 {'match_weight': -3.2875681028314045,
  'match_probability': 0.09289667059784089,
  'source_dataset_l': '__splink__input_table_1',
  'source_dataset_r': '__splink__input_table_1',
  'id_l': '1',
  'id_r': '1',
  'first_name_l': 'Harry',
  'first_name_r': 'Harry',
  'gamma_first_name': 3,
  'last_name_l': 'Styles',
  'last_name_r': 'Styles'},
 {'match_weight': -3.2875681028314045,
  'match_probability': 0.09289667059784089,
  'source_dataset_l': '__splink__input_table_1',
  'source_dataset_r': '__splink__input_table_1',
  'id_l': '2',
  'id_r': '2',
  'first_name_l': 'Louis',
  'first_name_r': 'Louis',
  'gamma_first_name': 3,
  'last_name_l': 'Tomlinsin',
  'last_name_r': 'Tomlinsin'}]

Splink Version 4 Repro:

Note - Update the region if needed and replace <bucket name> with the bucket you want to use.

# Print the Splink version
import splink
print(splink.__version__)

# Set some variables
database_name = "athena_timeout_testing"
s3_bucket = "<bucket name>"
boto3_region= "us-east-2"

# Create test DataFrames
import pandas as pd
data = {'id': [0, 1, 2, 3],
        'First Name': ['Niall', 'Harry', 'Louis', 'Zayne'],
        'Last Name': ['Horan', 'Styles', 'Tomlinsin', 'Malik']}
df = pd.DataFrame(data)
df= df.astype("string")
df_empty = df.head(0)

# Create a test database in Athena
import boto3
import awswrangler as wr
boto3_session = boto3.Session(region_name=boto3_region)
wr.catalog.create_database(database_name, boto3_session=boto3_session, exist_ok=True)

# Write the Dataframes to Athena
wr.s3.to_parquet(
    boto3_session=boto3_session,
    df=df,
    path=f"s3://{s3_bucket}/test-table/",
    dataset=True,
    mode="overwrite",
    database=database_name,
    table='sample_data'
)
wr.s3.to_parquet(
    boto3_session=boto3_session,
    df=df_empty,
    path=f"s3://{s3_bucket}/test-table-empty/",
    dataset=True,
    mode="overwrite",
    database=database_name,
    table='sample_empty'
)

# Create some example settings
import splink.comparison_library as cl
from splink import SettingsCreator, block_on
settings = SettingsCreator(
    link_type = "link_only",
    blocking_rules_to_generate_predictions=[
        block_on("first_name", "last_name")
    ],
    comparisons=[
        cl.LevenshteinAtThresholds("first_name", [1,2]),
    ],
    unique_id_column_name = "id"
)

# Create a Linker backed by the AthenaAPI and run predict (link_only) for the two Dataframes
from splink.backends.athena import AthenaAPI
db_api = AthenaAPI(
    boto3_session=boto3_session,
    output_database=database_name,
    output_bucket=s3_bucket,
)

from splink import Linker
linker_compare_to_empty = Linker([df_empty, df], settings, db_api)
sample1_vs_empty_results = linker_compare_to_empty.inference.predict()

# Get the output
sample1_vs_empty_results.as_record_dict()

Expected Output

[]

Actual Output

[{'match_weight': -3.2875681028314045,
  'match_probability': 0.09289667059784089,
  'source_dataset_l': '__splink__input_table_1',
  'source_dataset_r': '__splink__input_table_1',
  'id_l': '0',
  'id_r': '0',
  'first_name_l': 'Niall',
  'first_name_r': 'Niall',
  'gamma_first_name': 3,
  'last_name_l': 'Horan',
  'last_name_r': 'Horan'},
 {'match_weight': -3.2875681028314045,
  'match_probability': 0.09289667059784089,
  'source_dataset_l': '__splink__input_table_1',
  'source_dataset_r': '__splink__input_table_1',
  'id_l': '3',
  'id_r': '3',
  'first_name_l': 'Zayne',
  'first_name_r': 'Zayne',
  'gamma_first_name': 3,
  'last_name_l': 'Malik',
  'last_name_r': 'Malik'},
 {'match_weight': -3.2875681028314045,
  'match_probability': 0.09289667059784089,
  'source_dataset_l': '__splink__input_table_1',
  'source_dataset_r': '__splink__input_table_1',
  'id_l': '1',
  'id_r': '1',
  'first_name_l': 'Harry',
  'first_name_r': 'Harry',
  'gamma_first_name': 3,
  'last_name_l': 'Styles',
  'last_name_r': 'Styles'},
 {'match_weight': -3.2875681028314045,
  'match_probability': 0.09289667059784089,
  'source_dataset_l': '__splink__input_table_1',
  'source_dataset_r': '__splink__input_table_1',
  'id_l': '2',
  'id_r': '2',
  'first_name_l': 'Louis',
  'first_name_r': 'Louis',
  'gamma_first_name': 3,
  'last_name_l': 'Tomlinsin',
  'last_name_r': 'Tomlinsin'}]

OS:

macOS / Athena

Splink version:

4.0.4

Have you tried this on the latest master branch?

  • I agree

Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?

  • I agree
@alanakilleen alanakilleen added the bug Something isn't working label Nov 6, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

1 participant