Skip to content

Commit

Permalink
added string as a data type to check for categorical columns, and …
Browse files Browse the repository at this point in the history
…better outputs in the CLI
  • Loading branch information
rishsriv committed Jan 12, 2024
1 parent 954d7c6 commit 408ec3d
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 8 deletions.
22 changes: 16 additions & 6 deletions defog/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,6 @@ def init():
print(
"Do you want to automatically scan these tables to determine which column might be categorical? The distinct values in each categorical column will be sent to our server. (y/n)"
)

scan_option = prompt().strip()
if scan_option.lower() == "y" or scan_option.lower() == "yes":
scan = True
Expand Down Expand Up @@ -280,18 +279,29 @@ def gen():
print(
"defog gen requires a list of tables to generate. Please enter the names of the tables whose schema you would like to generate, separated by a space:"
)
print(
"If you would like to index all of your tables, just leave this blank and hit enter (Supported for postgres + redshift only)."
)
table_names = prompt().strip()
table_name_list = re.split(r"\s+", table_names.strip())
else:
table_name_list = sys.argv[2:]
filename = df.generate_db_schema(table_name_list)

if table_name_list == [""] or table_name_list == []:
print("No tables were registered. Exiting.")
sys.exit(0)
print(
"Do you want to automatically scan these tables to determine which column might be categorical? The distinct values in each column likely to be a categorical column will be sent to our server. (y/n)"
)
scan_option = prompt().strip()
if scan_option.lower() == "y" or scan_option.lower() == "yes":
scan = True
else:
scan = False

filename = df.generate_db_schema(table_name_list, scan=scan)
pwd = os.getcwd()
print(
"Your schema has been generated and is available at the following CSV file in this folder:\n"
)
print(f"\033[1m{filename}\033[0m\n")
print(f"\033[1m{pwd}/{filename}\033[0m\n")

print("We are now uploading this auto-generated schema to Defog.")
df.update_db_schema_csv(filename)
Expand Down
4 changes: 3 additions & 1 deletion defog/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,15 @@ def identify_categorical_columns(
# if it is a categorical variable, then we want to get the distinct values and their counts
# we will then send this to the defog servers so that we can generate a column description
# for each categorical variable
print(f"Identifying categorical columns in {table_name}...")
print(f"Identifying categorical columns in {table_name}. This might take a while if you have many rows in your table.")
for idx, row in enumerate(rows):
if row["data_type"].lower() in [
"character varying",
"text",
"character",
"varchar",
"char",
"string"
]:
# get the total number of rows and number of distinct values in the table for this column
cur.execute(
Expand All @@ -96,6 +97,7 @@ def identify_categorical_columns(
top_values = cur.fetchall()
top_values = [i[0] for i in top_values if i[0] is not None]
rows[idx]["top_values"] = top_values
print(f"Identified {row['column_name']} as a likely categorical column.")
return rows


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def package_files(directory):
name="defog",
packages=find_packages(),
package_data={"defog": ["gcp/*", "aws/*"] + next_static_files},
version="0.56.2",
version="0.56.3",
description="Defog is a Python library that helps you generate data queries from natural language questions.",
author="Full Stack Data Pte. Ltd.",
license="MIT",
Expand Down

0 comments on commit 408ec3d

Please sign in to comment.