From e1dcd8f8b0917a5a867791b3fe43f85587207ba7 Mon Sep 17 00:00:00 2001 From: "ksneab7@gmail.com" Date: Wed, 27 Sep 2023 11:43:52 -0400 Subject: [PATCH 1/4] fix for column name exclusion bug --- synthetic_data/generators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synthetic_data/generators.py b/synthetic_data/generators.py index af83e00b..d4da2417 100644 --- a/synthetic_data/generators.py +++ b/synthetic_data/generators.py @@ -105,7 +105,7 @@ def _generate_uncorrelated_column_data(self, num_samples): col_ = copy.deepcopy(col) generator_name = col_.get("data_type", None) - + column_header = col_.get("column_name", None) if not generator_name: logging.warning( f"Generator of type {generator_name} is not implemented." @@ -178,7 +178,7 @@ def _generate_uncorrelated_column_data(self, num_samples): else: dataset.append(generated_data) - column_names.append(generator_name) + column_names.append(column_header) return self.convert_data_to_df(dataset, column_names=column_names) From c1bfa3d6f3fdc6fcec4da709e8464297e78aad3b Mon Sep 17 00:00:00 2001 From: "ksneab7@gmail.com" Date: Wed, 27 Sep 2023 12:52:37 -0400 Subject: [PATCH 2/4] pytest fixes --- tests/test_generators.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tests/test_generators.py b/tests/test_generators.py index 2bb79cff..25b6a471 100644 --- a/tests/test_generators.py +++ b/tests/test_generators.py @@ -74,7 +74,23 @@ def test_synthesize_uncorrelated_output(self): np.testing.assert_array_equal( actual_synthetic_data.columns.values, np.array( - ["datetime", "categorical", "int", "string", "float"], dtype="object" + [ + "datetime", + "host", + "src", + "proto", + "srcport", + "destport", + "srcip", + "locale", + "localeabbr", + "postalcode", + "latitude", + "longitude", + "comment", + "int_col", + ], + dtype="object", ), ) @@ -308,6 +324,7 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): "data_stats": [ { "data_type": "int", + "column_name": "test_column_1", "order": "ascending", "statistics": { "min": 1.0, @@ -316,6 +333,7 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): }, { "data_type": "string", + "column_name": "test_column_2", "categorical": False, "order": "ascending", "statistics": { @@ -326,6 +344,7 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): }, { "data_type": "string", + "column_name": "test_column_3", "categorical": True, "order": "ascending", "statistics": { @@ -342,11 +361,13 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): }, { "data_type": "float", + "column_name": "test_column_4", "order": "ascending", "statistics": {"min": 2.11234, "max": 8.0, "precision": {"max": 6}}, }, { "data_type": "datetime", + "column_name": "test_column_5", "order": "ascending", "statistics": { "format": ["%Y-%m-%d"], @@ -384,7 +405,13 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): [4, "wqfed", "yellow", 7.775666, "2026-02-04"], [4, "wsde", "yellow", 7.818521, "2027-06-13"], ] - categories = ["int", "string", "categorical", "float", "datetime"] + categories = [ + "test_column_1", + "test_column_2", + "test_column_3", + "test_column_4", + "test_column_5", + ] expected_data = [dict(zip(categories, item)) for item in expected_array] expected_df = pd.DataFrame(expected_data) From 8d6550670c1c383b08b0c10065ea7321e96a637e Mon Sep 17 00:00:00 2001 From: ksneab7 <91956551+ksneab7@users.noreply.github.com> Date: Wed, 27 Sep 2023 12:54:35 -0400 Subject: [PATCH 3/4] Update tests/test_generators.py Co-authored-by: Taylor Turner --- tests/test_generators.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_generators.py b/tests/test_generators.py index 25b6a471..2bdb653b 100644 --- a/tests/test_generators.py +++ b/tests/test_generators.py @@ -405,7 +405,7 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): [4, "wqfed", "yellow", 7.775666, "2026-02-04"], [4, "wsde", "yellow", 7.818521, "2027-06-13"], ] - categories = [ + expected_column_names = [ "test_column_1", "test_column_2", "test_column_3", @@ -413,7 +413,9 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): "test_column_5", ] - expected_data = [dict(zip(categories, item)) for item in expected_array] + expected_data = [ + dict(zip(expected_column_names, item)) for item in expected_array + ] expected_df = pd.DataFrame(expected_data) actual_df = generator.synthesize(20) From a2ab09bc88293492fb31471d92538a0a7448eaf9 Mon Sep 17 00:00:00 2001 From: "ksneab7@gmail.com" Date: Wed, 27 Sep 2023 13:29:11 -0400 Subject: [PATCH 4/4] added null generation --- .../distinct_generators/null_generator.py | 14 ++++ synthetic_data/generators.py | 73 ++++++++++--------- tests/test_generators.py | 49 +++++++------ 3 files changed, 77 insertions(+), 59 deletions(-) create mode 100644 synthetic_data/distinct_generators/null_generator.py diff --git a/synthetic_data/distinct_generators/null_generator.py b/synthetic_data/distinct_generators/null_generator.py new file mode 100644 index 00000000..098d5826 --- /dev/null +++ b/synthetic_data/distinct_generators/null_generator.py @@ -0,0 +1,14 @@ +"""Contains a random Null generator.""" +import numpy as np + + +def null_generation(num_rows: int = 1) -> np.array: + """ + Randomly generates an array of integers between the given min and max values. + + :param num_rows: the number of rows in np array generated + :type num_rows: int, optional + + :return: np array of null values + """ + return np.array([None] * num_rows) diff --git a/synthetic_data/generators.py b/synthetic_data/generators.py index d4da2417..95b4754c 100644 --- a/synthetic_data/generators.py +++ b/synthetic_data/generators.py @@ -15,6 +15,7 @@ from synthetic_data.distinct_generators.datetime_generator import random_datetimes from synthetic_data.distinct_generators.float_generator import random_floats from synthetic_data.distinct_generators.int_generator import random_integers +from synthetic_data.distinct_generators.null_generator import null_generation from synthetic_data.distinct_generators.text_generator import random_text from synthetic_data.graph_synthetic_data import GraphDataGenerator from synthetic_data.synthetic_data import make_data_from_report @@ -42,6 +43,7 @@ def __init__( "datetime": random_datetimes, "string": random_text, "text": random_text, + "null_generator": null_generation, } @classmethod @@ -106,48 +108,45 @@ def _generate_uncorrelated_column_data(self, num_samples): generator_name = col_.get("data_type", None) column_header = col_.get("column_name", None) - if not generator_name: - logging.warning( - f"Generator of type {generator_name} is not implemented." - ) - continue col_["rng"] = self.rng col_["num_rows"] = num_samples + if generator_name: + if generator_name in ["string", "text"]: + if col_.get("categorical", False): + generator_name = "categorical" + total = 0 + for count in col["statistics"]["categorical_count"].values(): + total += count - if generator_name in ["string", "text"]: - if col_.get("categorical", False): - generator_name = "categorical" - total = 0 - for count in col["statistics"]["categorical_count"].values(): - total += count - - probabilities = [] - for count in col["statistics"]["categorical_count"].values(): - probabilities.append(count / total) + probabilities = [] + for count in col["statistics"]["categorical_count"].values(): + probabilities.append(count / total) - col_["probabilities"] = probabilities - col_["categories"] = col_["statistics"].get("categories", None) + col_["probabilities"] = probabilities + col_["categories"] = col_["statistics"].get("categories", None) - col_["vocab"] = col_["statistics"].get("vocab", None) + col_["vocab"] = col_["statistics"].get("vocab", None) - col_["min"] = col_["statistics"].get("min", None) - col_["max"] = col_["statistics"].get("max", None) + col_["min"] = col_["statistics"].get("min", None) + col_["max"] = col_["statistics"].get("max", None) - # edge cases for extracting data from profiler report. - if generator_name == "datetime": - col_["format"] = col_["statistics"].get("format", None) - col_["min"] = pd.to_datetime( - col_["statistics"].get("min", None), format=col_["format"][0] - ) - col_["max"] = pd.to_datetime( - col_["statistics"].get("max", None), format=col_["format"][0] - ) + # edge cases for extracting data from profiler report. + if generator_name == "datetime": + col_["format"] = col_["statistics"].get("format", None) + col_["min"] = pd.to_datetime( + col_["statistics"].get("min", None), format=col_["format"][0] + ) + col_["max"] = pd.to_datetime( + col_["statistics"].get("max", None), format=col_["format"][0] + ) - if generator_name == "float": - col_["precision"] = int( - col_["statistics"].get("precision", None).get("max", None) - ) + if generator_name == "float": + col_["precision"] = int( + col_["statistics"].get("precision", None).get("max", None) + ) + elif not generator_name: + generator_name = "null_generator" generator_func = self.gen_funcs.get(generator_name, None) params_gen_funcs = inspect.signature(generator_func) @@ -157,7 +156,9 @@ def _generate_uncorrelated_column_data(self, num_samples): param_build[param[0]] = col_[param[0]] generated_data = generator_func(**param_build) - if col_["order"] in sorting_types: + if (not generator_name == "null_generator") and col_[ + "order" + ] in sorting_types: dataset.append( self.get_ordered_column( generated_data, @@ -166,7 +167,9 @@ def _generate_uncorrelated_column_data(self, num_samples): ) ) else: - if col_["order"] is not None: + if (not generator_name == "null_generator") and col_[ + "order" + ] is not None: logging.warning( f"""{generator_name} is passed with sorting type of {col_["order"]}. Ascending and descending are the only supported options. diff --git a/tests/test_generators.py b/tests/test_generators.py index 2bdb653b..75707f02 100644 --- a/tests/test_generators.py +++ b/tests/test_generators.py @@ -79,6 +79,7 @@ def test_synthesize_uncorrelated_output(self): "host", "src", "proto", + "type", "srcport", "destport", "srcip", @@ -87,6 +88,7 @@ def test_synthesize_uncorrelated_output(self): "postalcode", "latitude", "longitude", + "owner", "comment", "int_col", ], @@ -317,9 +319,8 @@ def test_generate_uncorrelated_column_data( else: self.assertEqual(call_args_list[key], expected_calls[j][key]) - @mock.patch("synthetic_data.generators.logging.warning") @mock.patch("dataprofiler.profilers.StructuredProfiler.report") - def test_get_ordered_column_integration(self, mock_report, mock_warning): + def test_get_ordered_column_integration(self, mock_report): mock_report.return_value = { "data_stats": [ { @@ -376,6 +377,7 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): }, }, { + "column_name": "test_column_6", "data_type": None, }, ] @@ -384,26 +386,26 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): self.assertFalse(generator.is_correlated) expected_array = [ - [1, "arif", "blue", 2.246061, "2003-06-02"], - [1, "daips", "blue", 2.628393, "2003-10-08"], - [1, "dree", "orange", 2.642511, "2006-02-17"], - [1, "drqs", "orange", 2.807119, "2006-11-18"], - [1, "dwdaa", "orange", 3.009102, "2008-12-07"], - [2, "fswfe", "orange", 3.061853, "2009-12-03"], - [2, "fwqe", "orange", 3.677692, "2013-02-24"], - [2, "ipdpd", "orange", 3.887541, "2013-08-18"], - [3, "pdis", "red", 4.24257, "2014-02-19"], - [3, "peii", "red", 4.355663, "2014-04-29"], - [3, "pepie", "red", 4.739156, "2017-12-13"], - [3, "qrdq", "red", 4.831716, "2018-02-03"], - [3, "qrps", "yellow", 5.062321, "2019-05-13"], - [3, "rrqp", "yellow", 5.82323, "2020-01-09"], - [4, "sasr", "yellow", 6.212038, "2021-12-29"], - [4, "sspwe", "yellow", 6.231978, "2022-01-25"], - [4, "sssi", "yellow", 6.365346, "2023-03-20"], - [4, "wpfsi", "yellow", 7.461754, "2023-10-23"], - [4, "wqfed", "yellow", 7.775666, "2026-02-04"], - [4, "wsde", "yellow", 7.818521, "2027-06-13"], + [1, "arif", "blue", 2.246061, "2003-06-02", None], + [1, "daips", "blue", 2.628393, "2003-10-08", None], + [1, "dree", "orange", 2.642511, "2006-02-17", None], + [1, "drqs", "orange", 2.807119, "2006-11-18", None], + [1, "dwdaa", "orange", 3.009102, "2008-12-07", None], + [2, "fswfe", "orange", 3.061853, "2009-12-03", None], + [2, "fwqe", "orange", 3.677692, "2013-02-24", None], + [2, "ipdpd", "orange", 3.887541, "2013-08-18", None], + [3, "pdis", "red", 4.24257, "2014-02-19", None], + [3, "peii", "red", 4.355663, "2014-04-29", None], + [3, "pepie", "red", 4.739156, "2017-12-13", None], + [3, "qrdq", "red", 4.831716, "2018-02-03", None], + [3, "qrps", "yellow", 5.062321, "2019-05-13", None], + [3, "rrqp", "yellow", 5.82323, "2020-01-09", None], + [4, "sasr", "yellow", 6.212038, "2021-12-29", None], + [4, "sspwe", "yellow", 6.231978, "2022-01-25", None], + [4, "sssi", "yellow", 6.365346, "2023-03-20", None], + [4, "wpfsi", "yellow", 7.461754, "2023-10-23", None], + [4, "wqfed", "yellow", 7.775666, "2026-02-04", None], + [4, "wsde", "yellow", 7.818521, "2027-06-13", None], ] expected_column_names = [ "test_column_1", @@ -411,6 +413,7 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): "test_column_3", "test_column_4", "test_column_5", + "test_column_6", ] expected_data = [ @@ -420,8 +423,6 @@ def test_get_ordered_column_integration(self, mock_report, mock_warning): actual_df = generator.synthesize(20) - self.assertEqual(mock_warning.call_count, 1) - mock_warning.assert_called_with(f"Generator of type None is not implemented.") pd.testing.assert_frame_equal(expected_df, actual_df) @mock.patch("dataprofiler.profilers.StructuredProfiler.report")