diff --git a/.github/workflows/minimum.yml b/.github/workflows/minimum.yml index 5639069..2f73268 100644 --- a/.github/workflows/minimum.yml +++ b/.github/workflows/minimum.yml @@ -19,7 +19,13 @@ jobs: python-version: '3.12' steps: - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} for arm64 + if: ${{ matrix.os == 'macos-latest' && matrix.python-version == '3.8' }} + uses: "gabrielfalcao/pyenv-action@v17" + with: + default: ${{ matrix.python-version }} - name: Set up Python ${{ matrix.python-version }} + if: ${{ matrix.os != 'macos-latest' || matrix.python-version != '3.8' }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/HISTORY.md b/HISTORY.md index 6e9fd25..56c60f9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,19 @@ # History +## v0.9.0 - 2024-08-07 + +This release enables the diagnostic score to be computed in a benchmarking run. It also renames the `IndependentSynthesizer` to `ColumnSynthesizer`. Finally, it fixes a bug so that the time for all metrics will now be used to compute the `Evaluate_Time` column in the results. + +### Bugs Fixed + +* Cap numpy to less than 2.0.0 until SDGym supports - Issue [#313](https://github.com/sdv-dev/SDGym/issues/313) by @gsheni +* The returned `Evaluate_Time` does not include results from all metrics - Issue [#310](https://github.com/sdv-dev/SDGym/issues/310) by @lajohn4747 + +### New Features + +* Rename `IndependentSynthesizer` to `ColumnSynthesizer` - Issue [#319](https://github.com/sdv-dev/SDGym/issues/319) by @lajohn4747 +* Allow the ability to compute diagnostic score in a benchmarking run - Issue [#311](https://github.com/sdv-dev/SDGym/issues/311) by @lajohn4747 + ## v0.8.0 - 2024-06-07 This release adds support for both Python 3.11 and 3.12! It also drops support for Python 3.7. diff --git a/Makefile b/Makefile index fd1fe76..5a819f1 100644 --- a/Makefile +++ b/Makefile @@ -84,8 +84,8 @@ lint: .PHONY: fix-lint fix-lint: - ruff check --fix . - ruff format + invoke fix-lint + # TEST TARGETS .PHONY: test-unit diff --git a/latest_requirements.txt b/latest_requirements.txt index b6b288e..00432e8 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -4,11 +4,11 @@ compress-pickle==2.1.0 humanfriendly==10.0 numpy==1.26.4 pandas==2.2.2 -rdt==1.12.1 -scikit-learn==1.5.0 +rdt==1.12.2 +scikit-learn==1.5.1 scipy==1.13.1 -sdmetrics==0.14.1 -sdv==1.13.1 +sdmetrics==0.15.0 +sdv==1.15.0 tabulate==0.8.10 -torch==2.3.0 +torch==2.3.1 tqdm==4.66.4 diff --git a/pyproject.toml b/pyproject.toml index 7f3954a..9e73158 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,9 +26,9 @@ dependencies = [ 'botocore>=1.31,<2', 'compress-pickle>=1.2.0', 'humanfriendly>=8.2', - "numpy>=1.21.0;python_version<'3.10'", - "numpy>=1.23.3,<2;python_version>='3.10' and python_version<'3.12'", - "numpy>=1.26.0,<2;python_version>='3.12'", + "numpy>=1.21.0,<2.0.0;python_version<'3.10'", + "numpy>=1.23.3,<2.0.0;python_version>='3.10' and python_version<'3.12'", + "numpy>=1.26.0,<2.0.0;python_version>='3.12'", "pandas>=1.4.0;python_version<'3.11'", "pandas>=1.5.0;python_version>='3.11' and python_version<'3.12'", "pandas>=2.1.1;python_version>='3.12'", @@ -133,7 +133,7 @@ namespaces = false version = {attr = 'sdgym.__version__'} [tool.bumpversion] -current_version = "0.8.0" +current_version = "0.9.0.dev1" parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?' 
serialize = [ '{major}.{minor}.{patch}.{release}{candidate}', @@ -179,7 +179,8 @@ exclude = [ ".tox", ".git", "__pycache__", - ".ipynb_checkpoints" + ".ipynb_checkpoints", + "tasks.py", ] [tool.ruff.lint] @@ -189,14 +190,22 @@ select = [ # Pycodestyle "E", "W", - "D200", + # pydocstyle + "D", # isort "I001", + # print statements + "T201", + # pandas-vet + "PD" ] ignore = [ "E501", + # pydocstyle "D107", # Missing docstring in __init__ "D417", # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449 + "PD901", + "PD101", ] [tool.ruff.format] @@ -206,14 +215,18 @@ preview = true docstring-code-format = true docstring-code-line-length = "dynamic" -[tool.ruff.lint.pep8-naming] -extend-ignore-names = ["X", "C", "X_padded", "Y", "Y_padded"] - [tool.ruff.lint.isort] known-first-party = ["sdgym"] +lines-between-types = 0 [tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"] +"errors.py" = ["D105"] +"tests/**.py" = ["D"] [tool.ruff.lint.pydocstyle] -convention = "google" \ No newline at end of file +convention = "google" + +[tool.ruff.lint.pycodestyle] +max-doc-length = 100 +max-line-length = 100 diff --git a/sdgym/__init__.py b/sdgym/__init__.py index 90fa459..4459e6c 100644 --- a/sdgym/__init__.py +++ b/sdgym/__init__.py @@ -8,7 +8,7 @@ __copyright__ = 'Copyright (c) 2022 DataCebo, Inc.' __email__ = 'info@sdv.dev' __license__ = 'BSL-1.1' -__version__ = '0.8.0' +__version__ = '0.9.0.dev1' import logging diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py index 6a29696..95aedc9 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -15,8 +15,18 @@ import numpy as np import pandas as pd import tqdm -from sdmetrics.reports.multi_table import QualityReport as MultiTableQualityReport -from sdmetrics.reports.single_table import QualityReport as SingleTableQualityReport +from sdmetrics.reports.multi_table import ( + DiagnosticReport as MultiTableDiagnosticReport, +) +from sdmetrics.reports.multi_table import ( + QualityReport as MultiTableQualityReport, +) +from sdmetrics.reports.single_table import ( + DiagnosticReport as SingleTableDiagnosticReport, +) +from sdmetrics.reports.single_table import ( + QualityReport as SingleTableQualityReport, +) from sdgym.datasets import get_dataset_paths, load_dataset from sdgym.errors import SDGymError @@ -88,6 +98,7 @@ def _generate_job_args_list( detailed_results_folder, timeout, compute_quality_score, + compute_diagnostic_score, synthesizers, custom_synthesizers, ): @@ -124,6 +135,7 @@ def _generate_job_args_list( detailed_results_folder, timeout, compute_quality_score, + compute_diagnostic_score, dataset.name, 'single_table', ) @@ -164,6 +176,7 @@ def _compute_scores( metadata, output, compute_quality_score, + compute_diagnostic_score, modality, dataset_name, ): @@ -177,7 +190,8 @@ def _compute_scores( 'metric': metric_name, 'error': 'Metric Timeout', }) - output['scores'] = scores # re-inject list to multiprocessing output + # re-inject list to multiprocessing output + output['scores'] = scores error = None score = None @@ -200,7 +214,19 @@ def _compute_scores( 'error': error, 'metric_time': (datetime.utcnow() - start).total_seconds(), }) - output['scores'] = scores # re-inject list to multiprocessing output + # re-inject list to multiprocessing output + output['scores'] = scores + + if compute_diagnostic_score: + start = datetime.utcnow() + if modality == 'single_table': + diagnostic_report = SingleTableDiagnosticReport() + else: + 
diagnostic_report = MultiTableDiagnosticReport() + + diagnostic_report.generate(real_data, synthetic_data, metadata, verbose=False) + output['diagnostic_score_time'] = (datetime.utcnow() - start).total_seconds() + output['diagnostic_score'] = diagnostic_report.get_score() if compute_quality_score: start = datetime.utcnow() @@ -221,6 +247,7 @@ def _score( metrics, output=None, compute_quality_score=False, + compute_diagnostic_score=False, modality=None, dataset_name=None, ): @@ -239,7 +266,8 @@ def _score( ) output['dataset_size'] = get_size_of(data) / N_BYTES_IN_MB - output['error'] = 'Synthesizer Timeout' # To be deleted if there is no error + # To be deleted if there is no error + output['error'] = 'Synthesizer Timeout' synthetic_data, train_time, sample_time, synthesizer_size, peak_memory = _synthesize( synthesizer, data.copy(), metadata ) @@ -258,7 +286,8 @@ def _score( used_memory(), ) - del output['error'] # No error so far. _compute_scores tracks its own errors by metric + # No error so far. _compute_scores tracks its own errors by metric + del output['error'] _compute_scores( metrics, data, @@ -266,6 +295,7 @@ def _score( metadata, output, compute_quality_score, + compute_diagnostic_score, modality, dataset_name, ) @@ -295,6 +325,7 @@ def _score_with_timeout( metadata, metrics, compute_quality_score=False, + compute_diagnostic_score=False, modality=None, dataset_name=None, ): @@ -309,6 +340,7 @@ def _score_with_timeout( metrics, output, compute_quality_score, + compute_diagnostic_score, modality, dataset_name, ), @@ -325,15 +357,26 @@ def _score_with_timeout( return output -def _format_output(output, name, dataset_name, compute_quality_score, cache_dir): - evaluate_time = None - if 'scores' in output or 'quality_score_time' in output: - evaluate_time = output.get('quality_score_time', 0) +def _format_output( + output, name, dataset_name, compute_quality_score, compute_diagnostic_score, cache_dir +): + evaluate_time = 0 + if 'quality_score_time' in output: + evaluate_time += output.get('quality_score_time', 0) + if 'diagnostic_score_time' in output: + evaluate_time += output.get('diagnostic_score_time', 0) for score in output.get('scores', []): - if score['metric'] == 'NewRowSynthesis': + if 'metric_time' in score and not np.isnan(score['metric_time']): evaluate_time += score['metric_time'] + if ( + 'quality_score_time' not in output + and 'scores' not in output + and 'diagnostic_score_time' not in output + ): + evaluate_time = None + scores = pd.DataFrame({ 'Synthesizer': [name], 'Dataset': [dataset_name], @@ -345,6 +388,9 @@ def _format_output(output, name, dataset_name, compute_quality_score, cache_dir) 'Evaluate_Time': [evaluate_time], }) + if compute_diagnostic_score: + scores.insert(len(scores.columns), 'Diagnostic_Score', output.get('diagnostic_score')) + if compute_quality_score: scores.insert(len(scores.columns), 'Quality_Score', output.get('quality_score')) @@ -381,6 +427,7 @@ def _run_job(args): cache_dir, timeout, compute_quality_score, + compute_diagnostic_score, dataset_name, modality, ) = args @@ -404,6 +451,7 @@ def _run_job(args): metadata=metadata, metrics=metrics, compute_quality_score=compute_quality_score, + compute_diagnostic_score=compute_diagnostic_score, modality=modality, dataset_name=dataset_name, ) @@ -414,13 +462,16 @@ def _run_job(args): metadata=metadata, metrics=metrics, compute_quality_score=compute_quality_score, + compute_diagnostic_score=compute_diagnostic_score, modality=modality, dataset_name=dataset_name, ) except Exception as error: 
output['exception'] = error - scores = _format_output(output, name, dataset_name, compute_quality_score, cache_dir) + scores = _format_output( + output, name, dataset_name, compute_quality_score, compute_diagnostic_score, cache_dir + ) return scores @@ -482,7 +533,7 @@ def _run_jobs(multi_processing_config, job_args_list, show_progress): return scores -def _get_empty_dataframe(compute_quality_score, sdmetrics): +def _get_empty_dataframe(compute_diagnostic_score, compute_quality_score, sdmetrics): warnings.warn('No datasets/synthesizers found.') scores = pd.DataFrame({ @@ -496,6 +547,8 @@ def _get_empty_dataframe(compute_quality_score, sdmetrics): 'Evaluate_Time': [], }) + if compute_diagnostic_score: + scores['Diagnostic_Score'] = [] if compute_quality_score: scores['Quality_Score'] = [] if sdmetrics: @@ -564,7 +617,7 @@ def _create_sdgym_script(params, output_filepath): import sdgym from sdgym.synthesizers.sdv import (CopulaGANSynthesizer, CTGANSynthesizer, GaussianCopulaSynthesizer, HMASynthesizer, PARSynthesizer, SDVRelationalSynthesizer, - SDVTabularSynthesizer,TVAESynthesizer) + SDVTabularSynthesizer, TVAESynthesizer) results = sdgym.benchmark_single_table( {synthesizer_string}, custom_synthesizers={params['custom_synthesizers']}, @@ -572,6 +625,7 @@ def _create_sdgym_script(params, output_filepath): additional_datasets_folder={params['additional_datasets_folder']}, limit_dataset_size={params['limit_dataset_size']}, compute_quality_score={params['compute_quality_score']}, + compute_diagnostic_score={params['compute_diagnostic_score']}, sdmetrics={params['sdmetrics']}, timeout={params['timeout']}, detailed_results_folder={params['detailed_results_folder']}, multi_processing_config={params['multi_processing_config']} @@ -643,6 +697,7 @@ def benchmark_single_table( additional_datasets_folder=None, limit_dataset_size=False, compute_quality_score=True, + compute_diagnostic_score=True, sdmetrics=DEFAULT_METRICS, timeout=None, output_filepath=None, @@ -680,6 +735,8 @@ def benchmark_single_table( columns. compute_quality_score (bool): Whether or not to evaluate an overall quality score. + compute_diagnostic_score (bool): + Whether or not to evaluate an overall diagnostic score. sdmetrics (list[str]): A list of the different SDMetrics to use. If you'd like to input specific parameters into the metric, provide a tuple with the metric name followed by a dictionary of @@ -703,6 +760,11 @@ def benchmark_single_table( 'package_name': 'dask' or 'multiprocessing', 'num_workers': 4 } + run_on_ec2 (bool): + The flag is used to run the benchmark on an EC2 instance that will be created + by a script using the authentication of the current user. The EC2 instance + uses the LATEST released version of sdgym. Local changes or changes NOT + in the released version will NOT be used in the EC2 instance. 
Returns: pandas.DataFrame: @@ -729,6 +791,7 @@ def benchmark_single_table( detailed_results_folder, timeout, compute_quality_score, + compute_diagnostic_score, synthesizers, custom_synthesizers, ) @@ -738,7 +801,7 @@ def benchmark_single_table( # If no synthesizers/datasets are passed, return an empty dataframe else: - scores = _get_empty_dataframe(compute_quality_score, sdmetrics) + scores = _get_empty_dataframe(compute_diagnostic_score, compute_quality_score, sdmetrics) if output_filepath: write_csv(scores, output_filepath, None, None) diff --git a/sdgym/cli/__main__.py b/sdgym/cli/__main__.py index ebe55df..715d7ca 100644 --- a/sdgym/cli/__main__.py +++ b/sdgym/cli/__main__.py @@ -41,13 +41,13 @@ def _print_table(data, sort=None, reverse=False, format=None): if 'error' in data: error = data['error'] - if pd.isnull(error).all(): + if pd.isna(error).all(): del data['error'] else: long_error = error.str.len() > 30 data.loc[long_error, 'error'] = error[long_error].str[:30] + '...' - print(tabulate.tabulate(data, tablefmt='github', headers=data.columns, showindex=False)) + print(tabulate.tabulate(data, tablefmt='github', headers=data.columns, showindex=False)) # noqa: T201 def _run(args): @@ -110,7 +110,7 @@ def _download_datasets(args): def _list_downloaded(args): datasets = sdgym.cli.utils.get_downloaded_datasets(args.datasets_path) _print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size}) - print(f'Found {len(datasets)} downloaded datasets') + print(f'Found {len(datasets)} downloaded datasets') # noqa: T201 def _list_available(args): @@ -395,7 +395,7 @@ def main(): try: args.action(args) except sdgym.errors.SDGymError as error: - print(f'ERROR: {error}') + print(f'ERROR: {error}') # noqa: T201 if __name__ == '__main__': diff --git a/sdgym/cli/collect.py b/sdgym/cli/collect.py index a7cc6af..350fd29 100644 --- a/sdgym/cli/collect.py +++ b/sdgym/cli/collect.py @@ -22,7 +22,7 @@ def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None) If an ``aws_secret`` is provided, the given secret access key will be used to read from and/or write to any s3 paths. 
""" - print(f'Reading results from {input_path}') + print(f'Reading results from {input_path}') # noqa: T201 scores = read_csv_from_path(input_path, aws_key, aws_secret) scores = scores.drop_duplicates() @@ -31,5 +31,5 @@ def collect_results(input_path, output_file=None, aws_key=None, aws_secret=None) else: output = f'{input_path}/results.csv' - print(f'Storing results at {output}') + print(f'Storing results at {output}') # noqa: T201 write_csv(scores, output, aws_key, aws_secret) diff --git a/sdgym/cli/summary.py b/sdgym/cli/summary.py index c6355c1..06d872a 100644 --- a/sdgym/cli/summary.py +++ b/sdgym/cli/summary.py @@ -15,7 +15,7 @@ ) MODALITY_BASELINES = { - 'single-table': ['Uniform', 'Independent', 'CLBN', 'PrivBN'], + 'single-table': ['Uniform', 'Column', 'CLBN', 'PrivBN'], 'multi-table': ['Uniform', 'Independent'], 'timeseries': [], } @@ -46,7 +46,7 @@ def preprocess(data): def _coverage(data): total = len(data.Dataset.unique()) - scores = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notnull().sum()) + scores = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notna().sum()) coverage_perc = scores / total coverage_str = scores.astype(str) + f' / {total}' return coverage_perc, coverage_str @@ -102,7 +102,7 @@ def summarize(data, baselines=(), datasets=None): no_identity = data[data.Synthesizer != 'DataIdentity'] coverage_perc, coverage_str = _coverage(data) - solved = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notnull().sum()) + solved = data.groupby('Synthesizer').apply(lambda x: x.Quality_Score.notna().sum()) results = { 'total': len(data.Dataset.unique()), @@ -127,7 +127,7 @@ def summarize(data, baselines=(), datasets=None): for _, error_column in KNOWN_ERRORS: results[error_column] = grouped[error_column].sum() - results['errors'] = grouped.error.apply(lambda x: x.notnull().sum()) + results['errors'] = grouped.error.apply(lambda x: x.notna().sum()) total_errors = results['errors'] results['metric_errors'] = results['total'] - results['solved'] - total_errors @@ -160,7 +160,7 @@ def errors_summary(data): """ if 'error' in data.columns: all_errors = pd.DataFrame(_error_counts(data)).rename(columns={'error': 'all'}) - synthesizer_errors = data.groupby('Synthesizer').apply(_error_counts).unstack(level=0) + synthesizer_errors = data.groupby('Synthesizer').apply(_error_counts).pivot_table(level=0) for synthesizer, errors in synthesizer_errors.items(): all_errors[synthesizer] = errors.fillna(0).astype(int) @@ -217,7 +217,7 @@ def _find_library(synthesizer): def _add_summary_libraries(summary_data): summary_data['library'] = summary_data.index.map(_find_library) - summary_data['library'].fillna('Other', inplace=True) + summary_data['library'] = summary_data['library'].fillna('Other') return summary_data @@ -240,7 +240,7 @@ def _add_summary(data, modality, baselines, writer): }, axis=1, ) - summary.drop(index='Identity', inplace=True, errors='ignore') + summary = summary.drop(index='Identity', errors='ignore') summary = _add_summary_libraries(summary) beat_baseline_headers = ['beat_' + b.lower() for b in baselines] diff --git a/sdgym/cli/utils.py b/sdgym/cli/utils.py index 59d5c7b..7734627 100644 --- a/sdgym/cli/utils.py +++ b/sdgym/cli/utils.py @@ -67,7 +67,7 @@ def read_csv_from_path(path, aws_key, aws_secret): All csv content within a path will be read and returned in a DataFrame. The path can be either local or an s3 directory. - args: + Args: path (str): The path to read from, which can be either local or an s3 path. 
aws_key (str): diff --git a/sdgym/synthesizers/__init__.py b/sdgym/synthesizers/__init__.py index 00e93fc..bad59de 100644 --- a/sdgym/synthesizers/__init__.py +++ b/sdgym/synthesizers/__init__.py @@ -8,7 +8,7 @@ create_single_table_synthesizer, ) from sdgym.synthesizers.identity import DataIdentity -from sdgym.synthesizers.independent import IndependentSynthesizer +from sdgym.synthesizers.column import ColumnSynthesizer from sdgym.synthesizers.sdv import ( CopulaGANSynthesizer, CTGANSynthesizer, @@ -23,7 +23,7 @@ __all__ = ( 'DataIdentity', - 'IndependentSynthesizer', + 'ColumnSynthesizer', 'CTGANSynthesizer', 'TVAESynthesizer', 'UniformSynthesizer', diff --git a/sdgym/synthesizers/independent.py b/sdgym/synthesizers/column.py similarity index 95% rename from sdgym/synthesizers/independent.py rename to sdgym/synthesizers/column.py index 1474b2d..020a373 100644 --- a/sdgym/synthesizers/independent.py +++ b/sdgym/synthesizers/column.py @@ -1,4 +1,4 @@ -"""IndependentSynthesizer module.""" +"""ColumnSynthesizer module.""" import pandas as pd from rdt.hyper_transformer import HyperTransformer @@ -7,7 +7,7 @@ from sdgym.synthesizers.base import BaselineSynthesizer -class IndependentSynthesizer(BaselineSynthesizer): +class ColumnSynthesizer(BaselineSynthesizer): """Synthesizer that learns each column independently. Categorical columns are sampled using empirical frequencies. diff --git a/static_code_analysis.txt b/static_code_analysis.txt index 1f41f71..05299d1 100644 --- a/static_code_analysis.txt +++ b/static_code_analysis.txt @@ -1,4 +1,4 @@ -Run started:2024-04-11 07:28:47.466638 +Run started:2024-06-07 15:50:29.720919 Test results: >> Issue: [B403:blacklist] Consider possible security implications associated with pickle module. @@ -15,40 +15,39 @@ Test results: Severity: Low Confidence: High CWE: CWE-703 (https://cwe.mitre.org/data/definitions/703.html) More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b101_assert_used.html - Location: ./sdgym/benchmark.py:122:4 -121 synthesizer = synthesizer_dict['synthesizer'] -122 assert issubclass( -123 synthesizer, BaselineSynthesizer), '`synthesizer` must be a synthesizer class' -124 + Location: ./sdgym/benchmark.py:137:4 +136 synthesizer = synthesizer_dict['synthesizer'] +137 assert issubclass(synthesizer, BaselineSynthesizer), '`synthesizer` must be a synthesizer class' +138 -------------------------------------------------- >> Issue: [B608:hardcoded_sql_expressions] Possible SQL injection vector through string-based query construction. 
Severity: Medium Confidence: Low CWE: CWE-89 (https://cwe.mitre.org/data/definitions/89.html) More Info: https://bandit.readthedocs.io/en/1.7.7/plugins/b608_hardcoded_sql_expressions.html - Location: ./sdgym/benchmark.py:525:23 -524 # User data script to install the library -525 user_data_script = f"""#!/bin/bash -526 sudo apt update -y -527 sudo apt install python3-pip -y -528 echo "======== Install Dependencies ============" -529 sudo pip3 install sdgym -530 sudo pip3 install anyio -531 pip3 list -532 sudo apt install awscli -y -533 aws configure set aws_access_key_id {credentials.access_key} -534 aws configure set aws_secret_access_key {credentials.secret_key} -535 aws configure set region {session.region_name} -536 echo "======== Write Script ===========" -537 sudo touch ~/sdgym_script.py -538 echo "{script_content}" > ~/sdgym_script.py -539 echo "======== Run Script ===========" -540 sudo python3 ~/sdgym_script.py -541 echo "======== Complete ===========" -542 INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) -543 aws ec2 terminate-instances --instance-ids $INSTANCE_ID -544 """ -545 + Location: ./sdgym/benchmark.py:591:23 +590 # User data script to install the library +591 user_data_script = f"""#!/bin/bash +592 sudo apt update -y +593 sudo apt install python3-pip -y +594 echo "======== Install Dependencies ============" +595 sudo pip3 install sdgym +596 sudo pip3 install anyio +597 pip3 list +598 sudo apt install awscli -y +599 aws configure set aws_access_key_id {credentials.access_key} +600 aws configure set aws_secret_access_key {credentials.secret_key} +601 aws configure set region {session.region_name} +602 echo "======== Write Script ===========" +603 sudo touch ~/sdgym_script.py +604 echo "{script_content}" > ~/sdgym_script.py +605 echo "======== Run Script ===========" +606 sudo python3 ~/sdgym_script.py +607 echo "======== Complete ===========" +608 INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) +609 aws ec2 terminate-instances --instance-ids $INSTANCE_ID +610 """ +611 -------------------------------------------------- >> Issue: [B404:blacklist] Consider possible security implications associated with the subprocess module. @@ -83,7 +82,7 @@ Test results: -------------------------------------------------- Code scanned: - Total lines of code: 2645 + Total lines of code: 2634 Total lines skipped (#nosec): 0 Total potential issues skipped due to specifically being disabled (e.g., #nosec BXXX): 0 diff --git a/tasks.py b/tasks.py index 2eee93a..7e0588b 100644 --- a/tasks.py +++ b/tasks.py @@ -109,7 +109,14 @@ def minimum(c): def lint(c): check_dependencies(c) c.run('ruff check .') - c.run('ruff format . 
--check') + c.run('ruff format --check --diff .') + + +@task +def fix_lint(c): + check_dependencies(c) + c.run('ruff check --fix .') + c.run('ruff format .') def remove_readonly(func, path, _): diff --git a/tests/integration/synthesizers/test_independent.py b/tests/integration/synthesizers/test_column.py similarity index 72% rename from tests/integration/synthesizers/test_independent.py rename to tests/integration/synthesizers/test_column.py index 4b80832..161e7f7 100644 --- a/tests/integration/synthesizers/test_independent.py +++ b/tests/integration/synthesizers/test_column.py @@ -1,12 +1,12 @@ -"""Module to test the IndependentSynthesizer.""" +"""Module to test the ColumnSynthesizer.""" import numpy as np import pandas as pd -from sdgym.synthesizers.independent import IndependentSynthesizer +from sdgym.synthesizers.column import ColumnSynthesizer -def test_independent_synthesizer(): +def test_column_synthesizer(): """Ensure all sdtypes can be sampled.""" # Setup n_samples = 10000 @@ -24,11 +24,11 @@ def test_independent_synthesizer(): 'date': date_values, }) - independent_synthesizer = IndependentSynthesizer() + column_synthesizer = ColumnSynthesizer() # Run - trained_synthesizer = independent_synthesizer.get_trained_synthesizer(data, {}) - samples = independent_synthesizer.sample_from_synthesizer(trained_synthesizer, n_samples) + trained_synthesizer = column_synthesizer.get_trained_synthesizer(data, {}) + samples = column_synthesizer.sample_from_synthesizer(trained_synthesizer, n_samples) # Assert assert samples['num'].between(-10, 10).all() diff --git a/tests/integration/test_benchmark.py b/tests/integration/test_benchmark.py index a2f76ac..1f2bf2c 100644 --- a/tests/integration/test_benchmark.py +++ b/tests/integration/test_benchmark.py @@ -20,10 +20,10 @@ def test_benchmark_single_table_basic_synthsizers(): - """Test it with DataIdentity, IndependentSynthesizer and UniformSynthesizer.""" + """Test it with DataIdentity, ColumnSynthesizer and UniformSynthesizer.""" # Run output = sdgym.benchmark_single_table( - synthesizers=['DataIdentity', 'IndependentSynthesizer', 'UniformSynthesizer'], + synthesizers=['DataIdentity', 'ColumnSynthesizer', 'UniformSynthesizer'], sdv_datasets=['student_placements'], ) @@ -36,7 +36,7 @@ def test_benchmark_single_table_basic_synthsizers(): assert [ 'DataIdentity', - 'IndependentSynthesizer', + 'ColumnSynthesizer', 'UniformSynthesizer', ] == scores.index.tolist() @@ -44,7 +44,7 @@ def test_benchmark_single_table_basic_synthsizers(): assert [ 'UniformSynthesizer', - 'IndependentSynthesizer', + 'ColumnSynthesizer', 'DataIdentity', ] == quality_scores.index.tolist() @@ -53,7 +53,7 @@ def test_benchmark_single_table_no_metrics(): """Test it without metrics.""" # Run output = sdgym.benchmark_single_table( - synthesizers=['DataIdentity', 'IndependentSynthesizer', 'UniformSynthesizer'], + synthesizers=['DataIdentity', 'ColumnSynthesizer', 'UniformSynthesizer'], sdv_datasets=['student_placements'], sdmetrics=[], ) @@ -64,7 +64,7 @@ def test_benchmark_single_table_no_metrics(): assert 'Sample_Time' in output # Expect no metric columns. 
- assert len(output.columns) == 9 + assert len(output.columns) == 10 def test_benchmarking_no_report_output(): @@ -75,7 +75,7 @@ def test_benchmarking_no_report_output(): # Run with contextlib.redirect_stderr(prints): sdgym.benchmark_single_table( - synthesizers=['DataIdentity', 'IndependentSynthesizer', 'UniformSynthesizer'], + synthesizers=['DataIdentity', 'ColumnSynthesizer', 'UniformSynthesizer'], sdv_datasets=['student_placements'], ) @@ -102,7 +102,7 @@ def test_benchmark_single_table_error_handling(): # Run output = sdgym.benchmark_single_table( - synthesizers=['DataIdentity', 'IndependentSynthesizer', 'UniformSynthesizer'], + synthesizers=['DataIdentity', 'ColumnSynthesizer', 'UniformSynthesizer'], custom_synthesizers=[erroring_synthesizer], sdv_datasets=['student_placements'], ) @@ -120,7 +120,7 @@ def test_benchmark_single_table_compute_quality_score(): """Test ``compute_quality_score=False`` works.""" # Run output = sdgym.benchmark_single_table( - synthesizers=['DataIdentity', 'IndependentSynthesizer', 'UniformSynthesizer'], + synthesizers=['DataIdentity', 'ColumnSynthesizer', 'UniformSynthesizer'], sdv_datasets=['student_placements'], compute_quality_score=False, ) @@ -132,6 +132,22 @@ def test_benchmark_single_table_compute_quality_score(): assert 'Quality_Score' not in output +def test_benchmark_single_table_compute_diagnostic_score(): + """Test ``compute_diagnostic_score=False`` works.""" + # Run + output = sdgym.benchmark_single_table( + synthesizers=['DataIdentity', 'ColumnSynthesizer', 'UniformSynthesizer'], + sdv_datasets=['student_placements'], + compute_diagnostic_score=False, + ) + + # Assert + assert not output.empty + assert 'Train_Time' in output + assert 'Sample_Time' in output + assert 'Diagnostic_Score' not in output + + def test_benchmark_single_table_duplicate_synthesizers(): """Test it raises an error when passed a duplicate synthesizer.""" # Setup @@ -185,7 +201,7 @@ def sample_from_synthesizer(synthesizer, n_samples): 'CopulaGANSynthesizer', 'GaussianCopulaSynthesizer', 'DataIdentity', - 'IndependentSynthesizer', + 'ColumnSynthesizer', 'UniformSynthesizer', 'CTGANSynthesizer', ], @@ -200,7 +216,7 @@ def sample_from_synthesizer(synthesizer, n_samples): 'CopulaGANSynthesizer', 'GaussianCopulaSynthesizer', 'DataIdentity', - 'IndependentSynthesizer', + 'ColumnSynthesizer', 'UniformSynthesizer', 'CTGANSynthesizer', 'Custom:TestSynthesizer', @@ -248,6 +264,7 @@ def test_benchmark_single_table_timeout(): 'Synthesizer_Size_MB': {0: None}, 'Sample_Time': {0: None}, 'Evaluate_Time': {0: None}, + 'Diagnostic_Score': {0: None}, 'Quality_Score': {0: None}, 'error': {0: 'Synthesizer Timeout'}, }) @@ -264,7 +281,7 @@ def test_benchmark_single_table_only_datasets(): scores = benchmark_single_table(sdv_datasets=['fake_companies']) # Assert - assert len(scores.columns) == 10 + assert len(scores.columns) == 11 assert list(scores['Synthesizer']) == ['GaussianCopulaSynthesizer', 'CTGANSynthesizer'] assert list(scores['Dataset']) == ['fake_companies'] * 2 assert [round(score, 5) for score in scores['Dataset_Size_MB']] == [0.00128] * 2 @@ -274,6 +291,7 @@ def test_benchmark_single_table_only_datasets(): assert scores['Sample_Time'].between(0, 1000).all() assert scores['Evaluate_Time'].between(0, 1000).all() assert scores['Quality_Score'].between(0.5, 1).all() + assert (scores['Diagnostic_Score'] == 1.0).all() assert list(scores['NewRowSynthesis']) == [1.0] * 2 @@ -292,12 +310,13 @@ def test_benchmark_single_table_synthesizers_none(): ) # Assert - assert scores.shape == (1, 10) + 
assert scores.shape == (1, 11) scores = scores.iloc[0] assert scores['Synthesizer'] == 'Variant:test_synth' assert scores['Dataset'] == 'fake_companies' assert round(scores['Dataset_Size_MB'], 5) == 0.00128 assert 0.5 < scores['Quality_Score'] < 1 + assert scores['Diagnostic_Score'] == 1.0 assert ( scores[ ['Train_Time', 'Peak_Memory_MB', 'Synthesizer_Size_MB', 'Sample_Time', 'Evaluate_Time'] @@ -325,6 +344,7 @@ def test_benchmark_single_table_no_synthesizers(): 'Synthesizer_Size_MB': [], 'Sample_Time': [], 'Evaluate_Time': [], + 'Diagnostic_Score': [], 'Quality_Score': [], 'NewRowSynthesis': [], }) @@ -349,6 +369,7 @@ def test_benchmark_single_table_no_datasets(): 'Synthesizer_Size_MB': [], 'Sample_Time': [], 'Evaluate_Time': [], + 'Diagnostic_Score': [], 'Quality_Score': [], 'NewRowSynthesis': [], }) @@ -363,6 +384,7 @@ def test_benchmark_single_table_no_synthesizers_with_parameters(): sdv_datasets=['fake_companies'], sdmetrics=[('a', {'params'}), ('b', {'more_params'})], compute_quality_score=False, + compute_diagnostic_score=False, ) # Assert diff --git a/tests/unit/synthesizers/test_independent.py b/tests/unit/synthesizers/test_column.py similarity index 53% rename from tests/unit/synthesizers/test_independent.py rename to tests/unit/synthesizers/test_column.py index db92cd4..88b9e15 100644 --- a/tests/unit/synthesizers/test_independent.py +++ b/tests/unit/synthesizers/test_column.py @@ -2,20 +2,20 @@ import pandas as pd -from sdgym.synthesizers import IndependentSynthesizer +from sdgym.synthesizers import ColumnSynthesizer -class TestIndependentSynthesizer: - @patch('sdgym.synthesizers.independent.GaussianMixture') +class TestColumnSynthesizer: + @patch('sdgym.synthesizers.column.GaussianMixture') def test__get_trained_synthesizer(self, gm_mock): """Expect that GaussianMixture is instantiated with 4 components.""" # Setup - independent = IndependentSynthesizer() - independent.length = 10 + column_synthesizer = ColumnSynthesizer() + column_synthesizer.length = 10 data = pd.DataFrame({'col1': [1, 2, 3, 4]}) # Run - independent._get_trained_synthesizer(data, Mock()) + column_synthesizer._get_trained_synthesizer(data, Mock()) # Assert gm_mock.assert_called_once_with(4) diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py index 543f4b4..bc01624 100644 --- a/tests/unit/test_benchmark.py +++ b/tests/unit/test_benchmark.py @@ -4,7 +4,12 @@ import pytest from sdgym import benchmark_single_table -from sdgym.benchmark import _check_write_permissions, _create_sdgym_script, _directory_exists +from sdgym.benchmark import ( + _check_write_permissions, + _create_sdgym_script, + _directory_exists, + _format_output, +) from sdgym.synthesizers import GaussianCopulaSynthesizer @@ -21,7 +26,7 @@ def test_output_file_exists(path_mock): match='test_output.csv already exists. 
Please provide a file that does not already exist.', ): benchmark_single_table( - synthesizers=['DataIdentity', 'IndependentSynthesizer', 'UniformSynthesizer'], + synthesizers=['DataIdentity', 'ColumnSynthesizer', 'UniformSynthesizer'], sdv_datasets=['student_placements'], output_filepath=output_filepath, ) @@ -76,6 +81,7 @@ def test_benchmark_single_table_with_timeout(mock_multiprocessing, mock__score): 'Synthesizer_Size_MB': {0: None}, 'Sample_Time': {0: None}, 'Evaluate_Time': {0: None}, + 'Diagnostic_Score': {0: None}, 'Quality_Score': {0: None}, 'error': {0: 'Synthesizer Timeout'}, }) @@ -208,6 +214,7 @@ def test__create_sdgym_script(session_mock, mock_write_permissions, mock_directo ], 'limit_dataset_size': True, 'compute_quality_score': False, + 'compute_diagnostic_score': False, 'sdmetrics': [('NewRowSynthesis', {'synthetic_sample_size': 1000})], 'timeout': 600, 'output_filepath': 's3://sdgym-results/address_comments.csv', @@ -231,4 +238,60 @@ def test__create_sdgym_script(session_mock, mock_write_permissions, mock_directo assert "sdmetrics=[('NewRowSynthesis', {'synthetic_sample_size': 1000})]" in result assert 'timeout=600' in result assert 'compute_quality_score=False' in result + assert 'compute_diagnostic_score=False' in result assert 'import boto3' in result + + +def test__format_output(): + """Test the method ``_format_output`` and confirm that metrics are properly computed.""" + # Setup + mock_dataframe = pd.DataFrame([]) + mock_output = { + 'timeout': False, + 'dataset_size': 3.907452, + 'synthetic_data': mock_dataframe, + 'train_time': 267.028721, + 'sample_time': 1.039627, + 'synthesizer_size': 0.936981, + 'peak_memory': 127.729832, + 'diagnostic_score': 1.0, + 'quality_score': 0.881, + 'quality_score_time': 1.0, + 'diagnostic_score_time': 3.0, + 'scores': [ + { + 'metric': 'NewRowSynthesis', + 'error': None, + 'score': 0.998, + 'normalized_score': 0.998, + 'metric_time': 6.0, + }, + { + 'metric': 'NewMetric', + 'error': None, + 'score': 0.998, + 'normalized_score': 0.998, + 'metric_time': 5.0, + }, + ], + } + + # Run + scores = _format_output(mock_output, 'mock_name', 'mock_dataset', True, True, False) + + # Assert + expected_scores = pd.DataFrame({ + 'Synthesizer': ['mock_name'], + 'Dataset': ['mock_dataset'], + 'Dataset_Size_MB': [mock_output.get('dataset_size')], + 'Train_Time': [mock_output.get('train_time')], + 'Peak_Memory_MB': [mock_output.get('peak_memory')], + 'Synthesizer_Size_MB': [mock_output.get('synthesizer_size')], + 'Sample_Time': [mock_output.get('sample_time')], + 'Evaluate_Time': [15.0], + 'Diagnostic_Score': [1.0], + 'Quality_Score': [0.881], + 'NewRowSynthesis': [0.998], + 'NewMetric': [0.998], + }) + pd.testing.assert_frame_equal(scores, expected_scores) diff --git a/tests/unit/test_summary.py b/tests/unit/test_summary.py index 25abdea..b34d6fd 100644 --- a/tests/unit/test_summary.py +++ b/tests/unit/test_summary.py @@ -52,7 +52,7 @@ def test_make_summary_spreadsheet( 'second_best_time': [0, 1], 'third_best_time': [0, 0], 'beat_uniform': [2, 1], - 'beat_independent': [2, 1], + 'beat_column': [2, 1], 'beat_clbn': [2, 1], 'beat_privbn': [2, 1], 'timeout': [0, 1], @@ -90,7 +90,7 @@ def test_make_summary_spreadsheet( 'solved': [2, 1], 'best': [2, 0], 'beat_uniform': [2, 1], - 'beat_independent': [2, 1], + 'beat_column': [2, 1], 'beat_clbn': [2, 1], 'beat_privbn': [2, 1], },