Skip to content

Commit

Permalink
more efficient categorical variable scanning
Browse files: browse the repository at this point in the history
  • Loading branch information
rishsriv committed Jan 19, 2024
1 parent 29440f1 commit 67d7f0c
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 5 deletions.
8 changes: 4 additions & 4 deletions defog/util.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -77,7 +77,7 @@ def identify_categorical_columns(
f"Identifying categorical columns in {table_name}. This might take a while if you have many rows in your table."
)
for idx, row in enumerate(rows):
if row["data_type"].lower() in [
if row["data_type"].lower().strip() in [
"character varying",
"text",
"character",
Expand All @@ -87,9 +87,9 @@ def identify_categorical_columns(
]:
# get the total number of rows and number of distinct values in the table for this column
cur.execute(
f"SELECT COUNT({row['column_name']}) as tot_count, COUNT(DISTINCT {row['column_name']}) AS unique_count FROM {table_name};"
f"SELECT COUNT(DISTINCT {row['column_name']}) AS unique_count FROM {table_name};"
)
total_rows, num_distinct_values = cur.fetchone()
num_distinct_values = cur.fetchone()[0]

if num_distinct_values <= 10:
# get the top 10 distinct values
Expand All @@ -98,7 +98,7 @@ def identify_categorical_columns(
)
top_values = cur.fetchall()
top_values = [i[0] for i in top_values if i[0] is not None]
rows[idx]["top_values"] = top_values
rows[idx]["top_values"] = ",".join(top_values)
print(
f"Identified {row['column_name']} as a likely categorical column. The unique values are: {top_values}"
)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -25,7 +25,7 @@ def package_files(directory):
name="defog",
packages=find_packages(),
package_data={"defog": ["gcp/*", "aws/*"] + next_static_files},
version="0.57.0",
version="0.57.1",
description="Defog is a Python library that helps you generate data queries from natural language questions.",
author="Full Stack Data Pte. Ltd.",
license="MIT",
Expand Down

0 comments on commit 67d7f0c

Please sign in to comment.