From c8cb6fe18837cefca6601f296eb22e52a509fec0 Mon Sep 17 00:00:00 2001 From: Rishabh Srivastava Date: Tue, 5 Dec 2023 16:37:40 +0800 Subject: [PATCH] Rishabh/dx fixes (#15) * persist generate query url in the connection params * by default, add the auto-generate google sheet to the defog servers when defog gen is called * formatting * changed default mode of uploading metadata to local CSVs * improve error message * formatting * updated the `gen` function in the CLI to generate a CSV by default * fixed update function to use CSV --- defog/__init__.py | 412 +++++++++++++++++++++++++--------------------- defog/cli.py | 57 ++++--- setup.py | 2 +- 3 files changed, 264 insertions(+), 207 deletions(-) diff --git a/defog/__init__.py b/defog/__init__.py index 8028452..47f4a3c 100644 --- a/defog/__init__.py +++ b/defog/__init__.py @@ -5,6 +5,7 @@ import pandas as pd from defog.query import execute_query from importlib.metadata import version +from io import StringIO try: __version__ = version("defog") @@ -43,8 +44,6 @@ def __init__( 4) config file present, no params -> read params from config file 5) config file present, some/all params -> ignore existing config file, save new params to config file """ - self.generate_query_url = generate_query_url - if base64creds != "": self.from_base64_creds(base64creds) return @@ -57,7 +56,7 @@ def __init__( self.api_key = api_key self.db_type = db_type self.db_creds = db_creds - data = {"api_key": api_key, "db_type": db_type, "db_creds": db_creds} + self.generate_query_url = generate_query_url # write to filepath and print confirmation if save_json: self.save_connection_json() @@ -72,6 +71,10 @@ def __init__( self.api_key = data["api_key"] self.db_type = data["db_type"] self.db_creds = data["db_creds"] + self.generate_query_url = data.get( + "generate_query_url", + "https://api.defog.ai/generate_query_chat", + ) print(f"Connection details read from {self.filepath}.") else: raise KeyError( @@ -84,6 +87,8 @@ def __init__( self.api_key = api_key if db_type != "": self.db_type = db_type + + self.generate_query_url = generate_query_url self.db_creds = db_creds self.check_db_creds(self.db_type, self.db_creds) if save_json: @@ -101,6 +106,7 @@ def save_connection_json(self): "api_key": self.api_key, "db_type": self.db_type, "db_creds": self.db_creds, + "generate_query_url": self.generate_query_url, }, f, indent=4, @@ -211,7 +217,9 @@ def check_db_suitability(self, gsheets_url=None, tables=None): print(message) return True - def generate_postgres_schema(self, tables: list, upload: bool = True) -> str: + def generate_postgres_schema( + self, tables: list, upload: bool = True, return_format: str = "gsheets" + ) -> str: # when upload is True, we send the schema to the defog servers and generate a Google Sheet # when its false, we return the schema as a dict try: @@ -280,34 +288,59 @@ def generate_postgres_schema(self, tables: list, upload: bool = True) -> str: conn.close() print( - "Sending the schema to the defog servers and generating a Google Sheet. This might take up to 2 minutes..." + "Sending the schema to the defog servers and generating column descriptions. This might take up to 2 minutes..." ) if upload: # send the schemas dict to the defog servers - r = requests.post( - "https://api.defog.ai/get_postgres_schema_gsheets", - json={ - "api_key": self.api_key, - "schemas": schemas, - "foreign_keys": foreign_keys, - "indexes": indexes, - }, - ) - resp = r.json() - if "sheet_url" in resp: - gsheet_url = resp["sheet_url"] - return gsheet_url + if return_format == "gsheets": + r = requests.post( + "https://api.defog.ai/get_postgres_schema_gsheets", + json={ + "api_key": self.api_key, + "schemas": schemas, + "foreign_keys": foreign_keys, + "indexes": indexes, + }, + ) + resp = r.json() + if "sheet_url" in resp: + gsheet_url = resp["sheet_url"] + return gsheet_url + else: + print(f"We got an error!") + if "message" in resp: + print(f"Error message: {resp['message']}") + print( + f"Please feel free to open a github issue at https://github.com/defog-ai/defog-python if this a generic library issue, or email support@defog.ai." + ) else: - print(f"We got an error!") - if "message" in resp: - print(f"Error message: {resp['message']}") - print( - f"Please feel free to open a github issue if this a generic library issue, or email support@defog.ai if you need dedicated customer-specific support." + r = requests.post( + "https://api.defog.ai/get_schema_csv", + json={ + "api_key": self.api_key, + "schemas": schemas, + "foreign_keys": foreign_keys, + "indexes": indexes, + }, ) + resp = r.json() + if "csv" in resp: + csv = resp["csv"] + pd.read_csv(StringIO(csv)).to_csv("defog_metadata.csv", index=False) + return "defog_metadata.csv" + else: + print(f"We got an error!") + if "message" in resp: + print(f"Error message: {resp['message']}") + print( + f"Please feel free to open a github issue at https://github.com/defog-ai/defog-python if this a generic library issue, or email support@defog.ai." + ) else: return schemas - def generate_redshift_schema(self, tables: list, upload: bool = True) -> str: + def generate_redshift_schema( + self, tables: list, upload: bool = True, return_format: str = "gsheets" + ) -> str: # when upload is True, we send the schema to the defog servers and generate a Google Sheet # when its false, we return the schema as a dict try: @@ -387,33 +420,58 @@ def generate_redshift_schema(self, tables: list, upload: bool = True) -> str: if upload: print( - "Sending the schema to the defog servers and generating a Google Sheet. This might take up to 2 minutes..." + "Sending the schema to the defog servers and generating column descriptions. This might take up to 2 minutes..." ) # send the schemas dict to the defog servers - r = requests.post( - "https://api.defog.ai/get_postgres_schema_gsheets", - json={ - "api_key": self.api_key, - "schemas": schemas, - "foreign_keys": foreign_keys, - "indexes": indexes, - }, - ) - resp = r.json() - if "sheet_url" in resp: - gsheet_url = resp["sheet_url"] - return gsheet_url + if return_format == "gsheets": + r = requests.post( + "https://api.defog.ai/get_postgres_schema_gsheets", + json={ + "api_key": self.api_key, + "schemas": schemas, + "foreign_keys": foreign_keys, + "indexes": indexes, + }, + ) + resp = r.json() + if "sheet_url" in resp: + gsheet_url = resp["sheet_url"] + return gsheet_url + else: + print(f"We got an error!") + if "message" in resp: + print(f"Error message: {resp['message']}") + print( + f"Please feel free to open a github issue if this a generic library issue, or email support@defog.ai if you need dedicated customer-specific support." + ) else: - print(f"We got an error!") - if "message" in resp: - print(f"Error message: {resp['message']}") - print( - f"Please feel free to open a github issue if this a generic library issue, or email support@defog.ai if you need dedicated customer-specific support." + r = requests.post( + "https://api.defog.ai/get_schema_csv", + json={ + "api_key": self.api_key, + "schemas": schemas, + "foreign_keys": foreign_keys, + "indexes": indexes, + }, ) + resp = r.json() + if "csv" in resp: + csv = resp["csv"] + pd.read_csv(StringIO(csv)).to_csv("defog_metadata.csv", index=False) + return "defog_metadata.csv" + else: + print(f"We got an error!") + if "message" in resp: + print(f"Error message: {resp['message']}") + print( + f"Please feel free to open a github issue at https://github.com/defog-ai/defog-python if this a generic library issue, or email support@defog.ai." + ) else: return schemas - def generate_mysql_schema(self, tables: list, upload: bool = True) -> str: + def generate_mysql_schema( + self, tables: list, upload: bool = True, return_format: str = "gsheets" + ) -> str: try: import mysql.connector except: @@ -438,74 +496,54 @@ def generate_mysql_schema(self, tables: list, upload: bool = True) -> str: conn.close() if upload: - print( - "Sending the schema to the defog servers and generating a Google Sheet. This might take up to 2 minutes..." - ) - # send the schemas dict to the defog servers - r = requests.post( - "https://api.defog.ai/get_postgres_schema_gsheets", - json={"api_key": self.api_key, "schemas": schemas}, - ) - resp = r.json() - if "sheet_url" in resp: - gsheet_url = resp["sheet_url"] - return gsheet_url - else: - print(f"We got an error!") - if "message" in resp: - print(f"Error message: {resp['message']}") + if return_format == "gsheets": print( - f"Please feel free to open a github issue if this a generic library issue, or email support@defog.ai if you need dedicated customer-specific support." + "Sending the schema to the defog servers and generating a Google Sheet. This might take up to 2 minutes..." ) - else: - return schemas - - def generate_sqlserver_schema(self, tables: list, upload: bool = True) -> str: - try: - import pyodbc - except: - raise Exception("pyodbc not installed.") - - conn = pyodbc.connect(self.db_creds["connection_string"]) - cur = conn.cursor() - schemas = {} - - print("Getting schema for each table in your database...") - # get the schema for each table - for table_name in tables: - cur.execute( - f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{table_name}';" - ) - rows = cur.fetchall() - rows = [row for row in rows] - rows = [{"column_name": i[0], "data_type": i[1]} for i in rows] - schemas[table_name] = rows - - conn.close() - if upload: - print( - "Sending the schema to the defog servers and generating a Google Sheet. This might take up to 2 minutes..." - ) - # send the schemas dict to the defog servers - r = requests.post( - "https://api.defog.ai/get_postgres_schema_gsheets", - json={"api_key": self.api_key, "schemas": schemas}, - ) - resp = r.json() - if "sheet_url" in resp: - gsheet_url = resp["sheet_url"] - return gsheet_url + # send the schemas dict to the defog servers + r = requests.post( + "https://api.defog.ai/get_postgres_schema_gsheets", + json={"api_key": self.api_key, "schemas": schemas}, + ) + resp = r.json() + if "sheet_url" in resp: + gsheet_url = resp["sheet_url"] + return gsheet_url + else: + print(f"We got an error!") + if "message" in resp: + print(f"Error message: {resp['message']}") + print( + f"Please feel free to open a github issue if this a generic library issue, or email support@defog.ai if you need dedicated customer-specific support." + ) else: - print(f"We got an error!") - if "message" in resp: - print(f"Error message: {resp['message']}") - print( - f"Please feel free to open a github issue if this a generic library issue, or email support@defog.ai if you need dedicated customer-specific support." + r = requests.post( + "https://api.defog.ai/get_schema_csv", + json={ + "api_key": self.api_key, + "schemas": schemas, + "foreign_keys": [], + "indexes": [], + }, ) + resp = r.json() + if "csv" in resp: + csv = resp["csv"] + pd.read_csv(StringIO(csv)).to_csv("defog_metadata.csv", index=False) + return "defog_metadata.csv" + else: + print(f"We got an error!") + if "message" in resp: + print(f"Error message: {resp['message']}") + print( + f"Please feel free to open a github issue at https://github.com/defog-ai/defog-python if this a generic library issue, or email support@defog.ai." + ) else: return schemas - def generate_snowflake_schema(self, tables: list, upload: bool = True) -> str: + def generate_snowflake_schema( + self, tables: list, upload: bool = True, return_format: str = "gsheets" + ) -> str: try: import snowflake.connector except: @@ -546,67 +584,53 @@ def generate_snowflake_schema(self, tables: list, upload: bool = True) -> str: if upload: print( - "Sending the schema to the defog servers and generating a Google Sheet. This might take up to 2 minutes..." - ) - # send the schemas dict to the defog servers - r = requests.post( - "https://api.defog.ai/get_postgres_schema_gsheets", - json={"api_key": self.api_key, "schemas": schemas}, + "Sending the schema to the defog servers and generating column descriptions. This might take up to 2 minutes..." ) - resp = r.json() - if "sheet_url" in resp: - gsheet_url = resp["sheet_url"] - return gsheet_url + if return_format == "gsheets": + # send the schemas dict to the defog servers + r = requests.post( + "https://api.defog.ai/get_postgres_schema_gsheets", + json={"api_key": self.api_key, "schemas": schemas}, + ) + resp = r.json() + if "sheet_url" in resp: + gsheet_url = resp["sheet_url"] + return gsheet_url + else: + print(f"We got an error!") + if "message" in resp: + print(f"Error message: {resp['message']}") + print( + f"Please feel free to open a github issue if this a generic library issue, or email support@defog.ai if you need dedicated customer-specific support." + ) else: - print(f"We got an error!") - if "message" in resp: - print(f"Error message: {resp['message']}") - print( - f"Please feel free to open a github issue if this a generic library issue, or email support@defog.ai if you need dedicated customer-specific support." + r = requests.post( + "https://api.defog.ai/get_schema_csv", + json={ + "api_key": self.api_key, + "schemas": schemas, + "foreign_keys": [], + "indexes": [], + }, ) + resp = r.json() + if "csv" in resp: + csv = resp["csv"] + pd.read_csv(StringIO(csv)).to_csv("defog_metadata.csv", index=False) + return "defog_metadata.csv" + else: + print(f"We got an error!") + if "message" in resp: + print(f"Error message: {resp['message']}") + print( + f"Please feel free to open a github issue at https://github.com/defog-ai/defog-python if this a generic library issue, or email support@defog.ai." + ) else: return schemas - def generate_mongo_schema(self, collections: list) -> str: - try: - from pymongo import MongoClient - except: - raise Exception("pymongo not installed.") - - client = MongoClient(self.db_creds["connection_string"]) - db = client.get_database() - - schemas = {} - - print("Getting schema for each collection in your database...") - # get the schema for each table - for collection_name in collections: - collection = db[collection_name] - rows = collection.find_one() - rows = [ - {"field_name": i, "data_type": type(rows[i]).__name__} for i in rows - ] - schemas[collection_name] = rows - - client.close() - - print( - "Sending the schema to the defog servers and generating a Google Sheet. This might take up to 2 minutes..." - ) - # send the schemas dict to the defog servers - r = requests.post( - "https://api.defog.ai/get_mongo_schema_gsheets", - json={"api_key": self.api_key, "schemas": schemas}, - ) - resp = r.json() - try: - gsheet_url = resp["sheet_url"] - return gsheet_url - except Exception as e: - print(f"We got the following error: {resp['message']}") - print(f"Please feel free to email support@defog.ai") - - def generate_bigquery_schema(self, tables: list, upload: bool = True) -> str: + def generate_bigquery_schema( + self, tables: list, upload: bool = True, return_format: str = "gsheets" + ) -> str: try: from google.cloud import bigquery except: @@ -629,43 +653,61 @@ def generate_bigquery_schema(self, tables: list, upload: bool = True) -> str: if upload: print( - "Sending the schema to Defog servers and generating a Google Sheet. This might take up to 2 minutes..." - ) - print(schemas) - # send the schemas dict to the defog servers - r = requests.post( - "https://api.defog.ai/get_bigquery_schema_gsheets", - json={"api_key": self.api_key, "schemas": schemas}, + "Sending the schema to Defog servers and generating column descriptions. This might take up to 2 minutes..." ) - resp = r.json() - if "sheet_url" in resp: - gsheet_url = resp["sheet_url"] - return gsheet_url + if return_format == "gsheets": + # send the schemas dict to the defog servers + r = requests.post( + "https://api.defog.ai/get_bigquery_schema_gsheets", + json={"api_key": self.api_key, "schemas": schemas}, + ) + resp = r.json() + if "sheet_url" in resp: + gsheet_url = resp["sheet_url"] + return gsheet_url + else: + print(f"We got an error!") + if "message" in resp: + print(f"Error message: {resp['message']}") + print( + f"Please feel free to open a github issue if this a generic library issue, or email support@defog.ai if you need dedicated customer-specific support." + ) else: - print(f"We got an error!") - if "message" in resp: - print(f"Error message: {resp['message']}") - print( - f"Please feel free to open a github issue if this a generic library issue, or email support@defog.ai if you need dedicated customer-specific support." + r = requests.post( + "https://api.defog.ai/get_schema_csv", + json={ + "api_key": self.api_key, + "schemas": schemas, + "foreign_keys": [], + "indexes": [], + }, ) + resp = r.json() + if "csv" in resp: + csv = resp["csv"] + pd.read_csv(StringIO(csv)).to_csv("defog_metadata.csv", index=False) + return "defog_metadata.csv" + else: + print(f"We got an error!") + if "message" in resp: + print(f"Error message: {resp['message']}") + print( + f"Please feel free to open a github issue at https://github.com/defog-ai/defog-python if this a generic library issue, or email support@defog.ai." + ) else: return schemas def generate_db_schema(self, tables: list) -> str: if self.db_type == "postgres": - return self.generate_postgres_schema(tables) + return self.generate_postgres_schema(tables, return_format="csv") elif self.db_type == "mysql": - return self.generate_mysql_schema(tables) - elif self.db_type == "mongo": - return self.generate_mongo_schema(tables) + return self.generate_mysql_schema(tables, return_format="csv") elif self.db_type == "bigquery": - return self.generate_bigquery_schema(tables) + return self.generate_bigquery_schema(tables, return_format="csv") elif self.db_type == "redshift": - return self.generate_redshift_schema(tables) + return self.generate_redshift_schema(tables, return_format="csv") elif self.db_type == "snowflake": - return self.generate_snowflake_schema(tables) - elif self.db_type == "sqlserver": - return self.generate_sqlserver_schema(tables) + return self.generate_snowflake_schema(tables, return_format="csv") else: raise ValueError( f"Creation of a DB schema for {self.db_type} is not yet supported via the library. If you are a premium user, please contact us at founder@defog.ai so we can manually add it." diff --git a/defog/cli.py b/defog/cli.py index 4b27872..52191d8 100644 --- a/defog/cli.py +++ b/defog/cli.py @@ -190,19 +190,21 @@ def init(): sys.exit(0) else: df = defog.Defog(api_key=api_key, db_type=db_type, db_creds=db_creds) - gsheets_url = df.generate_db_schema(table_name_list) - print("Your schema has been generated and is available at:\n") - print(f"\033[1m{gsheets_url}\033[0m\n") + filename = df.generate_db_schema(table_name_list) + print( + "Your schema has been generated and is available as a CSV file in this folder at:\n" + ) + print(f"\033[1m{filename}\033[0m\n") print( - "You can give us more context about your schema at the above link. Once you're done, you can just hit enter to upload the data in this URL to Defog. If you would like to exit instead, just enter `exit`." + "You can give us more context about your schema by editing the CSV above. Once you're done, you can just hit enter to upload the data in the spreadsheet to Defog. If you would like to exit instead, just enter `exit`." ) upload_option = prompt() if upload_option == "exit": print("Exiting.") sys.exit(0) else: - resp = df.update_db_schema(gsheets_url) + resp = df.update_db_schema_csv(filename) if resp["status"] == "success": print("Your schema has been updated. You're ready to start querying!") else: @@ -255,11 +257,17 @@ def gen(): table_name_list = re.split(r"\s+", table_names.strip()) else: table_name_list = sys.argv[2:] - gsheets_url = df.generate_db_schema(table_name_list) - print("Your schema has been generated and is available at:\n") - print(f"\033[1m{gsheets_url}\033[0m\n") + filename = df.generate_db_schema(table_name_list) print( - "If you do modify the schema in the link provided, please run `defog update ` to update the updated schema." + "Your schema has been generated and is available at the following CSV file in this folder:\n" + ) + print(f"\033[1m{filename}\033[0m\n") + + print("We are now uploading this auto-generated schema to Defog.") + df.update_db_schema_csv(filename) + + print( + "If you modify the auto-generated schema, please run `defog update ` again to refresh the schema on Defog's servers." ) @@ -270,15 +278,15 @@ def update(): # check for 3rd arg (url), if not there, prompt user for url if len(sys.argv) < 3: print( - "defog update requires a google sheets url. Please enter the url of the google sheets document you would like to update:" + "defog update requires a CSV that contains your Database metadata. Please enter the path to the CSV you would like to update:" ) - gsheets_url = prompt() + filename = prompt() else: - gsheets_url = sys.argv[2] + filename = sys.argv[2] # load config from .defog/connection.json df = defog.Defog() # upload schema to defog - resp = df.update_db_schema(gsheets_url) + resp = df.update_db_schema_csv(filename) if resp["status"] == "success": print("Your schema has been updated. You're ready to start querying!") else: @@ -363,16 +371,23 @@ def query(): user_question = query resp = df.run_query(query, retries=3) if not resp["ran_successfully"]: - print("Defog generated the following query to answer your question:\n") - print(f"\033[1m{resp['query_generated']}\033[0m\n") + if "query_generated" in resp: + print( + "Defog generated the following query to answer your question:\n" + ) + print(f"\033[1m{resp['query_generated']}\033[0m\n") - print( - f"However, your query did not run successfully. The error message generated while running the query on your database was\n\n\033[1m{resp['error_message']}\033[0m\n." - ) + print( + f"However, your query did not run successfully. The error message generated while running the query on your database was\n\n\033[1m{resp['error_message']}\033[0m\n." + ) - print( - f"If you continue to get these errors, please consider updating the metadata in your schema by editing the google sheet generated and running `defog update `, or by updating your glossary.\n" - ) + print( + f"If you continue to get these errors, please consider updating the metadata in your schema by editing the google sheet generated and running `defog update `, or by updating your glossary.\n" + ) + else: + print( + f"Defog was unable to generate a query for your question. The error message generated while running the query on your database was\n\n\033[1m{resp.get('error_message')}\033[0m\n." + ) query = prompt("Enter another query, or type 'e' to exit: ") else: sql_generated = resp.get("query_generated") diff --git a/setup.py b/setup.py index a7c8c48..14ea647 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ name="defog", packages=find_packages(), package_data={"defog": ["gcp/*", "aws/*"]}, - version="0.52.0", + version="0.52.2", description="Defog is a Python library that helps you generate data queries from natural language questions.", author="Full Stack Data Pte. Ltd.", license="MIT",