diff --git a/README.md b/README.md index 505d2f7..b64c09d 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ ## :seedling: Set up your AnyParser API key -To get started, generate your API key from the [Playground Account Page](https://www.cambioml.com/account). Each account comes with **100 free pages**. +To get started, generate your API key from the [Sandbox Account Page](https://www.cambioml.com/account). Each account comes with **100 free pages**. > ⚠️ **Note:** The free API is limited to 10 pages/call. @@ -25,18 +25,18 @@ CAMBIO_API_KEY=0cam************************ ## :computer: Installation ### 1. Set Up a New Conda Environment and Install AnyParser -First, create and activate a new Conda environment, then install AnyParser: +First, create and activate a new Conda environment, then install AnyParser: ```bash conda create -n any-parse python=3.10 -y conda activate any-parse pip3 install any-parser ``` ### 2. Create an AnyParser Instance Using Your API Key -Use your API key to create an instance of AnyParserRT. Make sure you’ve set up your .env file to store your API key securely: +Use your API key to create an instance of AnyParser. Make sure you’ve set up your .env file to store your API key securely: ```python import os from dotenv import load_dotenv -from any_parser import AnyParserRT # Import the AnyParserRT class +from any_parser import AnyParser # Load environment variables load_dotenv(override=True) @@ -45,7 +45,7 @@ load_dotenv(override=True) example_apikey = os.getenv("CAMBIO_API_KEY") # Create an AnyParser instance -ap = AnyParserRT(api_key=example_apikey) +ap = AnyParser(api_key=example_apikey) ``` ### 3. 
Run Synchronous Extraction @@ -73,4 +73,3 @@ Are you an AI engineer looking to **accurately** extract both the text and layou ### [Extract a Table from an Image into Markdown Format](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb) Are you a financial analyst needing to **accurately** extract numbers from a table within an image? Explore this [3-minute notebook example](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb). - diff --git a/any_parser/__init__.py b/any_parser/__init__.py index 0ab294c..2d4b271 100644 --- a/any_parser/__init__.py +++ b/any_parser/__init__.py @@ -4,4 +4,4 @@ __all__ = ["AnyParser", "ModelType"] -__version__ = "0.0.16" +__version__ = "0.0.17" diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index 44a62e5..32c6b27 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -9,33 +9,15 @@ import requests +from any_parser.utils import ( + ModelType, + check_file_type_and_path, + check_model, + upload_file_to_presigned_url, +) + PUBLIC_SHARED_BASE_URL = "https://public-api.cambio-ai.com" TIMEOUT = 60 -SUPPORTED_FILE_EXTENSIONS = [ - "pdf", - "doc", - "docx", - "ppt", - "pptx", - "jpg", - "jpeg", - "png", - "gif", -] -RESULT_TYPES = ["markdown", "json"] -RESUME_EXTRACT_TYPES = [ - "education", - "work_experience", - "personal_info", - "skills", - "certifications", - "projects", -] - - -class ModelType(Enum): - BASE = "base" - PRO = "pro" class ProcessType(Enum): @@ -76,23 +58,28 @@ def extract( model: ModelType = ModelType.BASE, extract_args: Optional[Dict] = None, ) -> Tuple[str, str]: - """Extract data in real-time. + """Extract full content from a file in real-time. Args: file_path (str): The path to the file to be parsed. - extract_args (Optional[Dict]): Additional extraction arguments added to prompt + model (ModelType): The model to use for extraction. Can be + `ModelType.BASE` or `ModelType.PRO`. Defaults to `ModelType.BASE`. 
+ extract_args (Optional[Dict]): Additional extraction arguments added + to the prompt. + Returns: tuple(str, str): The extracted data and the time taken. """ + file_extension = Path(file_path).suffix.lower().lstrip(".") # Check if the file exists and file_type - error = self._check_file_type_and_path(file_path, file_extension) + error = check_file_type_and_path(file_path, file_extension) if error: return error, None - error = self._check_model(model) + error = check_model(model) if error: return error, None @@ -145,24 +132,24 @@ def extract( else: return f"Error: {response.status_code} {response.text}", None - def extract_json( + def extract_key_value( self, file_path: str, extract_instruction: Dict, ) -> Tuple[str, str]: - """Extract json in real-time. + """Extract key-value pairs from a file in real-time. Args: file_path (str): The path to the file to be parsed. - extract_instruction (Dict): A dictionary containing the keys to be extracted, - with their values as the description of those keys. + extract_instruction (Dict): A dictionary containing the keys to be + extracted, with their values as the description of those keys. Returns: tuple(str, str): The extracted data and the time taken. """ file_extension = Path(file_path).suffix.lower().lstrip(".") # Check if the file exists and file_type - error = self._check_file_type_and_path(file_path, file_extension) + error = check_file_type_and_path(file_path, file_extension) if error: return error, None @@ -274,10 +261,12 @@ def async_extract( model: ModelType = ModelType.BASE, extract_args: Optional[Dict] = None, ) -> str: - """Extract data asynchronously. + """Extract full content from a file asynchronously. Args: file_path (str): The path to the file to be parsed. + model (ModelType): The model to use for extraction. Can be + `ModelType.BASE` or `ModelType.PRO`. Defaults to `ModelType.BASE`. 
extract_args (Optional[Dict]): Additional extraction arguments added to prompt Returns: str: The file id of the uploaded file. @@ -285,12 +274,12 @@ def async_extract( file_extension = Path(file_path).suffix.lower().lstrip(".") # Check if the file exists and file_type - error = self._check_file_type_and_path(file_path, file_extension) + error = check_file_type_and_path(file_path, file_extension) if error: return error, None - error = self._check_model(model) + error = check_model(model) if error: return error, None @@ -321,26 +310,26 @@ def async_extract( ) # If response successful, upload the file - return self._upload_file_to_presigned_url(file_path, response) + return upload_file_to_presigned_url(file_path, response) - def async_extract_json( + def async_extract_key_value( self, file_path: str, extract_instruction: Dict, ) -> str: - """Extract data asynchronously. + """Extract key-value pairs from a file asynchronously. Args: file_path (str): The path to the file to be parsed. - extract_instruction (Dict): A dictionary containing the keys to be extracted, - with their values as the description of those keys. + extract_instruction (Dict): A dictionary containing the keys to be + extracted, with their values as the description of those keys. Returns: str: The file id of the uploaded file. 
""" file_extension = Path(file_path).suffix.lower().lstrip(".") # Check if the file exists and file_type - error = self._check_file_type_and_path(file_path, file_extension) + error = check_file_type_and_path(file_path, file_extension) if error: return error, None @@ -363,7 +352,7 @@ def async_extract_json( ) # If response successful, upload the file - return self._upload_file_to_presigned_url(file_path, response) + return upload_file_to_presigned_url(file_path, response) def async_fetch( self, @@ -371,7 +360,6 @@ def async_fetch( sync: bool = True, sync_timeout: int = 60, sync_interval: int = 5, - result_type: str = "markdown", ) -> str: """Fetches extraction results asynchronously. @@ -380,13 +368,11 @@ def async_fetch( sync (bool, optional): Whether to wait for the results synchronously. sync_timeout (int, optional): Maximum time to wait for results in seconds. Defaults to 60. sync_interval (int, optional): Time interval between polling attempts in seconds. Defaults to 5. - result_type (string, optional): The type of result to fetch. Defaults to `markdown`. Returns: str: The extracted results as a markdown string. None: If the extraction is still in progress (when sync is False). 
""" - self._check_result_type(result_type) response = None # Create the JSON payload @@ -416,58 +402,13 @@ def async_fetch( if response is None: return "Error: timeout, no response received" if response.status_code == 200: - if result_type == "json": - return response.json()["json"] - else: - markdown_list = response.json()["markdown"] + result = response.json() + if "json" in result: + return result["json"] + elif "markdown" in result: + markdown_list = result["markdown"] return "\n".join(markdown_list) + return f"Error: Invalid response format\n {result}" if response.status_code == 202: return None return f"Error: {response.status_code} {response.text}" - - def _upload_file_to_presigned_url( - self, file_path: str, response: requests.Response - ) -> str: - if response.status_code == 200: - try: - file_id = response.json().get("fileId") - presigned_url = response.json().get("presignedUrl") - with open(file_path, "rb") as file_to_upload: - files = {"file": (file_path, file_to_upload)} - upload_resp = requests.post( - presigned_url["url"], - data=presigned_url["fields"], - files=files, - timeout=TIMEOUT, - ) - if upload_resp.status_code != 204: - return f"Error: {upload_resp.status_code} {upload_resp.text}" - return file_id - except json.JSONDecodeError: - return "Error: Invalid JSON response" - else: - return f"Error: {response.status_code} {response.text}" - - def _check_model(self, model: ModelType) -> None: - if model not in {ModelType.BASE, ModelType.PRO}: - valid_models = ", ".join(["`" + model.value + "`" for model in ModelType]) - return f"Invalid model type: {model}. Supported `model` types include {valid_models}." 
- - def _check_file_type_and_path(self, file_path, file_extension): - # Check if the file exists - if not Path(file_path).is_file(): - return f"Error: File does not exist: {file_path}" - - if file_extension not in SUPPORTED_FILE_EXTENSIONS: - supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS) - return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}." - - def _check_result_type(self, result_type: str) -> None: - if result_type not in RESULT_TYPES: - valid_result_types = ", ".join(RESULT_TYPES) - return f"Invalid result type: {result_type}. Supported `result_type` types include {valid_result_types}." - - def _check_resume_extract_type(self, extract_type: str) -> None: - if extract_type not in RESUME_EXTRACT_TYPES: - valid_extract_types = ", ".join(RESUME_EXTRACT_TYPES) - return f"Invalid extract type: {extract_type}. Supported `extract_type` types include {valid_extract_types}." diff --git a/any_parser/utils.py b/any_parser/utils.py new file mode 100644 index 0000000..ed70fe1 --- /dev/null +++ b/any_parser/utils.py @@ -0,0 +1,63 @@ +import json +from enum import Enum +from pathlib import Path + +import requests + + +class ModelType(Enum): + BASE = "base" + PRO = "pro" + + +SUPPORTED_FILE_EXTENSIONS = [ + "pdf", + "doc", + "docx", + "ppt", + "pptx", + "jpg", + "jpeg", + "png", + "gif", +] + + +def upload_file_to_presigned_url( + file_path: str, response: requests.Response, timeout: int = 10 +) -> str: + if response.status_code == 200: + try: + file_id = response.json().get("fileId") + presigned_url = response.json().get("presignedUrl") + with open(file_path, "rb") as file_to_upload: + files = {"file": (file_path, file_to_upload)} + upload_resp = requests.post( + presigned_url["url"], + data=presigned_url["fields"], + files=files, + timeout=timeout, + ) + if upload_resp.status_code != 204: + return f"Error: {upload_resp.status_code} {upload_resp.text}" + return file_id + except json.JSONDecodeError: + return "Error: 
Invalid JSON response" + else: + return f"Error: {response.status_code} {response.text}" + + +def check_model(model: ModelType) -> None: + if model not in {ModelType.BASE, ModelType.PRO}: + valid_models = ", ".join(["`" + model.value + "`" for model in ModelType]) + return f"Invalid model type: {model}. Supported `model` types include {valid_models}." + + +def check_file_type_and_path(file_path, file_extension): + # Check if the file exists + if not Path(file_path).is_file(): + return f"Error: File does not exist: {file_path}" + + if file_extension not in SUPPORTED_FILE_EXTENSIONS: + supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS) + return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}." diff --git a/examples/async_pdf_to_json.ipynb b/examples/async_pdf_to_key_value.ipynb similarity index 99% rename from examples/async_pdf_to_json.ipynb rename to examples/async_pdf_to_key_value.ipynb index 4f4164d..f517912 100644 --- a/examples/async_pdf_to_json.ipynb +++ b/examples/async_pdf_to_key_value.ipynb @@ -6,7 +6,7 @@ "source": [ "# Extract Key-Value pairs from PDF\n", "\n", - "Below is an example of using AnyParser to accurately extract values from a sample PDF into JSON format.\n", + "Below is an example of using AnyParser to accurately extract values from a sample PDF into key-value pairs.\n", "\n", "To load the preview for the PDF document used in this demo, install the Poppler PDF rendering library:\n", "- Mac:\n", @@ -55,10 +55,10 @@ "# !pip3 install --upgrade any-parser\n", "\n", "# Option 2: if you have sdk respository installed locally, add system path\n", - "import sys\n", - "sys.path.append(\".\")\n", - "sys.path.append(\"..\")\n", - "sys.path.append(\"../..\")" + "# import sys\n", + "# sys.path.append(\".\")\n", + "# sys.path.append(\"..\")\n", + "# sys.path.append(\"../..\")" ] }, { @@ -156,15 +156,14 @@ "\n", "We need to specify 2 parameters for the `async_upload`:\n", "- `file_path`: The path to the 
PDF file.\n", - "- `extract_instruction`: This is a dictionary of the form `{\"key\": \"value\"}` where the key is the name of the key to extract and the value is a description of that key.\n", + "- `extract_instruction`: This is a dictionary of the form `{\"key\": \"key description\"}` where the key is the name of the key to extract and the key description is a description of that key.\n", "\n", "For the `async_fetch`, we need need to specify the following parameters:\n", "- `file_id`: The file ID returned from the `async_upload` function.\n", "- `sync_timeout`: The time in seconds to wait for the extraction to complete.\n", - "- `synce_interval`: The time in seconds to wait between each check for the extraction status.\n", - "- `result_type`: The type of result to extract. It defaults to `markdown`, so we'll need to set it to `json`.\n", + "- `sync_interval`: The time in seconds to wait between each check for the extraction status.\n", "\n", - "The extract may take 1-20 seconds per page. Note that this example uses the Synchronous API. To see how AnyParser can be used asynchronously, see the [Synchronous API notebook](./pdf_to_json.ipynb)." + "The extract may take 1-20 seconds per page. Note that this example uses the Asynchronous API. To see how AnyParser can be used synchronously, see the [Synchronous API notebook](./pdf_to_key_value.ipynb)."
] }, { @@ -199,10 +198,10 @@ " }\n", "\n", "# extract returns a tuple containing the markdown as a string and total time\n", - "file_id = ap.async_extract_json(example_local_file, extract_instruction)\n", + "file_id = ap.async_extract_key_value(example_local_file, extract_instruction)\n", "\n", "# fetch results (5s polling up to 60s)\n", - "result = ap.async_fetch(file_id, sync=True, sync_timeout=60, sync_interval=5, result_type=\"json\")\n", + "result = ap.async_fetch(file_id, sync=True, sync_timeout=60, sync_interval=5)\n", "display(result)" ] }, diff --git a/examples/pdf_to_json.ipynb b/examples/pdf_to_key_value.ipynb similarity index 99% rename from examples/pdf_to_json.ipynb rename to examples/pdf_to_key_value.ipynb index 1b51d9d..5ed420f 100644 --- a/examples/pdf_to_json.ipynb +++ b/examples/pdf_to_key_value.ipynb @@ -6,7 +6,7 @@ "source": [ "# Extract Key-Value pairs from PDF\n", "\n", - "Below is an example of using AnyParser to accurately extract values from a sample PDF into JSON format.\n", + "Below is an example of using AnyParser to accurately extract values from a sample PDF into key-value pairs.\n", "\n", "To load the preview for the PDF document used in this demo, install the Poppler PDF rendering library:\n", "- Mac:\n", @@ -55,10 +55,10 @@ "# !pip3 install --upgrade any-parser\n", "\n", "# Option 2: if you have sdk respository installed locally, add system path\n", - "import sys\n", - "sys.path.append(\".\")\n", - "sys.path.append(\"..\")\n", - "sys.path.append(\"../..\")" + "# import sys\n", + "# sys.path.append(\".\")\n", + "# sys.path.append(\"..\")\n", + "# sys.path.append(\"../..\")" ] }, { @@ -156,9 +156,9 @@ "\n", "We need to specify 2 parameters:\n", "- `file_path`: The path to the PDF file.\n", - "- `extract_instruction`: This is a dictionary of the form `{\"key\": \"value\"}` where the key is the name of the key to extract and the value is a description of that key.\n", + "- `extract_instruction`: This is a dictionary of the form 
`{\"key\": \"key description\"}` where the key is the name of the key to extract and the key description is a description of that key.\n", "\n", - "The extract may take 1-20 seconds per page. Note that this example uses the Synchronous API. To see how AnyParser can be used asynchronously, see the [Asynchronous API notebook](./async_pdf_to_json.ipynb)." + "The extract may take 1-20 seconds per page. Note that this example uses the Synchronous API. To see how AnyParser can be used asynchronously, see the [Asynchronous API notebook](./async_pdf_to_key_value.ipynb)." ] }, { @@ -179,7 +179,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Time Elapsed: 6.02 seconds\n" + "Time Elapsed: 4.89 seconds\n" ] } ], @@ -193,7 +193,7 @@ "\n", "\n", "# extract returns a tuple containing the markdown as a string and total time\n", - "key_value_result, total_time = ap.extract_json(example_local_file, extract_instruction)\n", + "key_value_result, total_time = ap.extract_key_value(example_local_file, extract_instruction)\n", "\n", "display(key_value_result)\n", "print(total_time)" diff --git a/pyproject.toml b/pyproject.toml index 1ead1b4..c31b651 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,19 +1,26 @@ [tool.poetry] name = "any-parser" -version = "0.0.16" +version = "0.0.17" description = "Parser for all." 
authors = ["CambioML "] maintainers = ["Rachel Hu "] readme = "README.md" [tool.poetry.dependencies] -python = ">=3.8,<3.13" +python = ">=3.9,<3.13" requests = "^2.25.0" python-dotenv = "^1.0.0" + +[tool.poetry.group.dev.dependencies] Levenshtein = [ { version = "0.25.1", python = "<3.9" }, { version = "0.26.0", python = ">=3.9" } ] +black = "^24.8.0" +isort = "^5.13.2" +autoflake = "^2.3.1" +pytest = "^8.3.3" +pre-commit = "^4.0.1" [build-system] requires = ["poetry-core"] diff --git a/tests/README.md b/tests/README.md index e7f4166..a4b475d 100644 --- a/tests/README.md +++ b/tests/README.md @@ -6,11 +6,34 @@ These tests are written using the unittest framework in Python. The tests are lo ## Setup 1. Install the required packages by running the following command: ```bash -pip install Levenshtein +poetry install ``` +In the `dev.dependencies` section of the `pyproject.toml` file, you will see the packages that are installed. -## Running Tests -1. Make sure you are in the root folder. +2. Add a `.env` file in the `tests` folder with the following content: +```bash +API_KEY=************* +``` + +## Pre-commit +This project uses pre-commit to run checks before committing code. To initialize `pre-commit` for this repo, run the following command: +```bash +pre-commit install +``` + +Now, with every commit, the checks will run automatically on the files added to the commit. The checks include: +- `black` for code formatting +- `flake8` for linting +- `isort` for import sorting +- running the unit tests in `tests/test.py` + +If you want to run the checks manually, you can run the following command: +```bash +pre-commit run --all-files +``` + +## Running Tests Manually +1. Make sure you are in the project root folder. 2. 
Run the following command: ```bash ./run_tests.sh diff --git a/tests/test.py b/tests/test.py index 0332812..c24b3d9 100755 --- a/tests/test.py +++ b/tests/test.py @@ -177,12 +177,12 @@ def test_image_async_extract_and_fetch(self): percentage, 90, f"Output similarity too low: {percentage:.2f}%" ) - def test_sync_extract_json(self): + def test_sync_extract_key_value(self): """Synchronous JSON Extraction with subtests for different file formats""" for data in EXTRACT_JSON_TEST_DATA: with self.subTest(working_file=data["working_file"]): # extract - key_value_result, elapsed_time = self.ap.extract_json( + key_value_result, elapsed_time = self.ap.extract_key_value( data["working_file"], data["extract_instruction"] ) @@ -190,19 +190,17 @@ def test_sync_extract_json(self): self.assertEqual(key_value_result, data["correct_output"]) self.assertIn("Time Elapsed", elapsed_time) - def test_async_extract_json_and_fetch(self): + def test_async_extract_key_value_and_fetch(self): """Asynchronous JSON Extraction with subtests for different file formats""" for data in EXTRACT_JSON_TEST_DATA: with self.subTest(working_file=data["working_file"]): # extract - file_id = self.ap.async_extract_json( + file_id = self.ap.async_extract_key_value( data["working_file"], data["extract_instruction"] ) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch - key_value_result = self.ap.async_fetch( - file_id=file_id, result_type="json" - ) + key_value_result = self.ap.async_fetch(file_id=file_id) # assertions self.assertEqual(key_value_result, data["correct_output"]) # wait 1 s between requests