Skip to content

Commit

Permalink
Merge pull request #324 from drahc1R/tabular-integration
Browse files Browse the repository at this point in the history
Connected tabular generator with dataset_generator
  • Loading branch information
taylorfturner authored Aug 9, 2023
2 parents 8e396f5 + f289ffd commit 0cc79fa
Show file tree
Hide file tree
Showing 13 changed files with 832 additions and 540 deletions.
140 changes: 0 additions & 140 deletions synthetic_data/dataset_generator.py

This file was deleted.

20 changes: 10 additions & 10 deletions synthetic_data/distinct_generators/datetime_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,19 @@ def generate_datetime(

def random_datetimes(
rng: Generator,
date_format_list: Optional[str] = None,
start_date: pd.Timestamp = None,
end_date: pd.Timestamp = None,
format: Optional[str] = None,
min: pd.Timestamp = None,
max: pd.Timestamp = None,
num_rows: int = 1,
) -> np.array:
"""
Generate datetime given the random_state, date_format, and start/end dates.
Generate datetime given the random_state, format, and start/end dates.
:param rng: the np rng object used to generate random values
:type rng: numpy Generator
:param date_format: the format that the generated datatime will follow,
:param format: the format that the generated datetime will follow,
defaults to None
:type date_format: str, None, optional
:type format: str, None, optional
:param min: the earliest date that datetimes can be generated at,
defaults to pd.Timestamp(1920, 1, 1)
:type min: pd.Timestamp, None, optional
Expand All @@ -67,13 +67,13 @@ def random_datetimes(
:rtype: numpy array
"""
date_list = [""] * num_rows
if not date_format_list:
date_format_list = ["%B %d %Y %H:%M:%S"]
if not format:
format = ["%B %d %Y %H:%M:%S"]

for i in range(num_rows):
date_format = rng.choice(date_format_list)
date_format = rng.choice(format)
datetime = generate_datetime(
rng, date_format=date_format, start_date=start_date, end_date=end_date
rng, date_format=date_format, start_date=min, end_date=max
)
date_list[i] = datetime

Expand Down
24 changes: 12 additions & 12 deletions synthetic_data/distinct_generators/float_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,29 @@

def random_floats(
rng: Generator,
min_value: int = -1e6,
max_value: int = 1e6,
sig_figs: int = 3,
min: int = -1e6,
max: int = 1e6,
precision: int = 3,
num_rows: int = 1,
) -> np.array:
"""
Randomly generates an array of floats between the given min and max values.
:param rng: the np rng object used to generate random values
:type rng: numpy Generator
:param min_value: the minimum float that can be returned
:type min_value: int, optional
:param max_value: the maximum float that can be returned
:type max_value: int, optional
:param min: the minimum float that can be returned
:type min: int, optional
:param max: the maximum float that can be returned
:type max: int, optional
:param precision: number of digits after the decimal point that each float is rounded to
:type precision: int, optional
:param num_rows: the number of rows in np array generated
:type num_rows: int, optional
:return: np array of floats
"""
if sig_figs < 0:
raise ValueError("sig_figs should be greater than or equal to 0")
if not isinstance(sig_figs, int):
raise ValueError("sig_figs should be an int")
return np.around(rng.uniform(min_value, max_value, num_rows), sig_figs)
if precision < 0:
raise ValueError("precision should be greater than or equal to 0")
if not isinstance(precision, int):
raise ValueError("precision should be an int")
return np.around(rng.uniform(min, max, num_rows), precision)
16 changes: 10 additions & 6 deletions synthetic_data/distinct_generators/int_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,24 @@


def random_integers(
rng: Generator, min_value: int = -1e6, max_value: int = 1e6, num_rows: int = 1
rng: Generator, min: int = -1e6, max: int = 1e6, num_rows: int = 1
) -> np.array:
"""
Randomly generates an array of integers between the given min and max values.
:param rng: the np rng object used to generate random values
:type rng: numpy Generator
:param min_value: the minimum integer that can be returned
:type min_value: int, optional
:param max_value: the maximum integer that can be returned
:type max_value: int, optional
:param min: the minimum integer that can be returned
:type min: int, optional
:param max: the maximum integer that can be returned
:type max: int, optional
:param num_rows: the number of rows in np array generated
:type num_rows: int, optional
:return: np array of integers
"""
return rng.integers(min_value, max_value, (num_rows,))
# rng.integers treats its upper bound as exclusive.
# Increment max so that the generated values can include the max param value.
max += 1

return rng.integers(min, max, (num_rows,))
31 changes: 18 additions & 13 deletions synthetic_data/distinct_generators/text_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,30 @@

def random_text(
rng: Generator,
chars: Optional[List[str]] = None,
str_len_min: int = 1,
str_len_max: int = 1000,
vocab: Optional[List[str]] = None,
min: int = 1,
max: int = 1000,
num_rows: int = 1,
) -> np.array:
"""
Randomly generates an array of text with lengths between the min and max values.
:param rng: the np rng object used to generate random values
:type rng: numpy Generator
:param chars: a list of values that are allowed in a string or None
:type chars: List[str], None
:param vocab: a list of values that are allowed in a string or None
:type vocab: List[str], None
:param num_rows: the number of rows in np array generated
:type num_rows: int, optional
:param str_len_min: the minimum length a string can be
:type str_len_min: int, optional
:param str_len_max: the maximum length a string can be
:type str_len_max: int (one above the max), optional
:param min: the minimum length a string can be
:type min: int, optional
:param max: the maximum length a string can be
:type max: int, optional
:return: numpy array of strings
:rtype: numpy array
"""
if chars is None:
chars = list(
if vocab is None:
vocab = list(
string.ascii_uppercase
+ string.ascii_lowercase
+ string.digits
Expand All @@ -39,9 +40,13 @@ def random_text(
)
text_list = []

# rng.integers treats its upper bound as exclusive.
# Increment max so that the generated string lengths can include the max param value.
max += 1

for _ in range(num_rows):
length = rng.integers(str_len_min, str_len_max)
string_entry = "".join(rng.choice(chars, (length,)))
length = rng.integers(min, max)
string_entry = "".join(rng.choice(vocab, (length,)))
text_list.append(string_entry)

return np.array(text_list)
2 changes: 1 addition & 1 deletion synthetic_data/generator_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __new__(cls, seed=None, config=None, *args, **kwargs):

profile = kwargs.pop("profile", None)
data = kwargs.pop("data", None)
if not profile and not data:
if not profile and data is None:
raise ValueError(
"No profile object or dataset was passed in kwargs. "
"If you want to generate synthetic data from a "
Expand Down
Loading

0 comments on commit 0cc79fa

Please sign in to comment.