-
Notifications
You must be signed in to change notification settings - Fork 28
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix column_name and null generation #337
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
"""Contains a random Null generator.""" | ||
import numpy as np | ||
|
||
|
||
def null_generation(num_rows: int = 1) -> np.ndarray:
    """
    Generate an array made up entirely of null (``None``) values.

    This is used to produce fully null columns.

    :param num_rows: the number of rows in np array generated
    :type num_rows: int, optional

    :return: np array of null values
    :rtype: np.ndarray
    """
    # dtype=object is given explicitly so that a zero-row request still
    # yields an object array; NumPy would infer float64 for an empty list.
    return np.array([None] * num_rows, dtype=object)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
from synthetic_data.distinct_generators.datetime_generator import random_datetimes | ||
from synthetic_data.distinct_generators.float_generator import random_floats | ||
from synthetic_data.distinct_generators.int_generator import random_integers | ||
from synthetic_data.distinct_generators.null_generator import null_generation | ||
from synthetic_data.distinct_generators.text_generator import random_text | ||
from synthetic_data.graph_synthetic_data import GraphDataGenerator | ||
from synthetic_data.synthetic_data import make_data_from_report | ||
|
@@ -42,6 +43,7 @@ def __init__( | |
"datetime": random_datetimes, | ||
"string": random_text, | ||
"text": random_text, | ||
"null_generator": null_generation, | ||
} | ||
|
||
@classmethod | ||
|
@@ -105,49 +107,46 @@ def _generate_uncorrelated_column_data(self, num_samples): | |
col_ = copy.deepcopy(col) | ||
|
||
generator_name = col_.get("data_type", None) | ||
|
||
if not generator_name: | ||
logging.warning( | ||
f"Generator of type {generator_name} is not implemented." | ||
) | ||
continue | ||
column_header = col_.get("column_name", None) | ||
|
||
col_["rng"] = self.rng | ||
col_["num_rows"] = num_samples | ||
if generator_name: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This check is needed in place of the removed warning, because we now account for a fully null column. |
||
if generator_name in ["string", "text"]: | ||
if col_.get("categorical", False): | ||
generator_name = "categorical" | ||
total = 0 | ||
for count in col["statistics"]["categorical_count"].values(): | ||
total += count | ||
|
||
if generator_name in ["string", "text"]: | ||
if col_.get("categorical", False): | ||
generator_name = "categorical" | ||
total = 0 | ||
for count in col["statistics"]["categorical_count"].values(): | ||
total += count | ||
|
||
probabilities = [] | ||
for count in col["statistics"]["categorical_count"].values(): | ||
probabilities.append(count / total) | ||
probabilities = [] | ||
for count in col["statistics"]["categorical_count"].values(): | ||
probabilities.append(count / total) | ||
|
||
col_["probabilities"] = probabilities | ||
col_["categories"] = col_["statistics"].get("categories", None) | ||
col_["probabilities"] = probabilities | ||
col_["categories"] = col_["statistics"].get("categories", None) | ||
|
||
col_["vocab"] = col_["statistics"].get("vocab", None) | ||
col_["vocab"] = col_["statistics"].get("vocab", None) | ||
|
||
col_["min"] = col_["statistics"].get("min", None) | ||
col_["max"] = col_["statistics"].get("max", None) | ||
col_["min"] = col_["statistics"].get("min", None) | ||
col_["max"] = col_["statistics"].get("max", None) | ||
|
||
# edge cases for extracting data from profiler report. | ||
if generator_name == "datetime": | ||
col_["format"] = col_["statistics"].get("format", None) | ||
col_["min"] = pd.to_datetime( | ||
col_["statistics"].get("min", None), format=col_["format"][0] | ||
) | ||
col_["max"] = pd.to_datetime( | ||
col_["statistics"].get("max", None), format=col_["format"][0] | ||
) | ||
# edge cases for extracting data from profiler report. | ||
if generator_name == "datetime": | ||
col_["format"] = col_["statistics"].get("format", None) | ||
col_["min"] = pd.to_datetime( | ||
col_["statistics"].get("min", None), format=col_["format"][0] | ||
) | ||
col_["max"] = pd.to_datetime( | ||
col_["statistics"].get("max", None), format=col_["format"][0] | ||
) | ||
|
||
if generator_name == "float": | ||
col_["precision"] = int( | ||
col_["statistics"].get("precision", None).get("max", None) | ||
) | ||
if generator_name == "float": | ||
col_["precision"] = int( | ||
col_["statistics"].get("precision", None).get("max", None) | ||
) | ||
elif not generator_name: | ||
generator_name = "null_generator" | ||
|
||
generator_func = self.gen_funcs.get(generator_name, None) | ||
params_gen_funcs = inspect.signature(generator_func) | ||
|
@@ -157,7 +156,9 @@ def _generate_uncorrelated_column_data(self, num_samples): | |
param_build[param[0]] = col_[param[0]] | ||
|
||
generated_data = generator_func(**param_build) | ||
if col_["order"] in sorting_types: | ||
if (not generator_name == "null_generator") and col_[ | ||
"order" | ||
] in sorting_types: | ||
dataset.append( | ||
self.get_ordered_column( | ||
generated_data, | ||
|
@@ -166,7 +167,9 @@ def _generate_uncorrelated_column_data(self, num_samples): | |
) | ||
) | ||
else: | ||
if col_["order"] is not None: | ||
if (not generator_name == "null_generator") and col_[ | ||
"order" | ||
] is not None: | ||
Comment on lines
169
to
+172
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. just an else here (only way it gets here is if […]) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The warning above is no longer needed, because null columns are now accounted for. |
||
logging.warning( | ||
f"""{generator_name} is passed with sorting type of {col_["order"]}. | ||
Ascending and descending are the only supported options. | ||
|
@@ -178,7 +181,7 @@ def _generate_uncorrelated_column_data(self, num_samples): | |
else: | ||
dataset.append(generated_data) | ||
|
||
column_names.append(generator_name) | ||
column_names.append(column_header) | ||
|
||
return self.convert_data_to_df(dataset, column_names=column_names) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function is used to generate fully null columns