Merge branch 'main' into extract-resume

CambioML · Oct 25, 2024 · ad0fc39
2 parents 2636f64 + cde5b95
commit ad0fc39
Show file tree

Hide file tree

Showing 9 changed files with 167 additions and 137 deletions.
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
 
 ## :seedling: Set up your AnyParser API key
 
-To get started, generate your API key from the [Playground Account Page](https://www.cambioml.com/account). Each account comes with **100 free pages**.
+To get started, generate your API key from the [Sandbox Account Page](https://www.cambioml.com/account). Each account comes with **100 free pages**.
 
 > ⚠️ **Note:** The free API is limited to 10 pages/call.
 
@@ -25,18 +25,18 @@ CAMBIO_API_KEY=0cam************************
 
 ## :computer: Installation
 ### 1. Set Up a New Conda Environment and Install AnyParser
-First, create and activate a new Conda environment, then install AnyParser: 
+First, create and activate a new Conda environment, then install AnyParser:
 ```bash
 conda create -n any-parse python=3.10 -y
 conda activate any-parse
 pip3 install any-parser
 ```
 ### 2. Create an AnyParser Instance Using Your API Key
-Use your API key to create an instance of AnyParserRT. Make sure you’ve set up your .env file to store your API key securely:
+Use your API key to create an instance of AnyParser. Make sure you’ve set up your .env file to store your API key securely:
 ```python
 import os
 from dotenv import load_dotenv
-from any_parser import AnyParserRT  # Import the AnyParserRT class
+from any_parser import AnyParser
 
 # Load environment variables
 load_dotenv(override=True)
@@ -45,7 +45,7 @@ load_dotenv(override=True)
 example_apikey = os.getenv("CAMBIO_API_KEY")
 
 # Create an AnyParser instance
-ap = AnyParserRT(api_key=example_apikey)
+ap = AnyParser(api_key=example_apikey)
 ```
 
 ### 3. Run Synchronous Extraction
@@ -73,4 +73,3 @@ Are you an AI engineer looking to **accurately** extract both the text and layou
 
 ### [Extract a Table from an Image into Markdown Format](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb)
 Are you a financial analyst needing to **accurately** extract numbers from a table within an image? Explore this [3-minute notebook example](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb).
-
diff --git a/any_parser/__init__.py b/any_parser/__init__.py
@@ -4,4 +4,4 @@
 
 __all__ = ["AnyParser", "ModelType"]
 
-__version__ = "0.0.16"
+__version__ = "0.0.17"
diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py
@@ -9,33 +9,15 @@
 
 import requests
 
+from any_parser.utils import (
+    ModelType,
+    check_file_type_and_path,
+    check_model,
+    upload_file_to_presigned_url,
+)
+
 PUBLIC_SHARED_BASE_URL = "https://public-api.cambio-ai.com"
 TIMEOUT = 60
-SUPPORTED_FILE_EXTENSIONS = [
-    "pdf",
-    "doc",
-    "docx",
-    "ppt",
-    "pptx",
-    "jpg",
-    "jpeg",
-    "png",
-    "gif",
-]
-RESULT_TYPES = ["markdown", "json"]
-RESUME_EXTRACT_TYPES = [
-    "education",
-    "work_experience",
-    "personal_info",
-    "skills",
-    "certifications",
-    "projects",
-]
-
-
-class ModelType(Enum):
-    BASE = "base"
-    PRO = "pro"
 
 
 class ProcessType(Enum):
@@ -76,23 +58,28 @@ def extract(
         model: ModelType = ModelType.BASE,
         extract_args: Optional[Dict] = None,
     ) -> Tuple[str, str]:
-        """Extract data in real-time.
+        """Extract full content from a file in real-time.
 
         Args:
             file_path (str): The path to the file to be parsed.
-            extract_args (Optional[Dict]): Additional extraction arguments added to prompt
+            model (ModelType): The model to use for extraction. Can be
+                `ModelType.BASE` or `ModelType.PRO`. Defaults to `ModelType.BASE`.
+            extract_args (Optional[Dict]): Additional extraction arguments added
+                to the prompt.
+
         Returns:
             tuple(str, str): The extracted data and the time taken.
         """
+
         file_extension = Path(file_path).suffix.lower().lstrip(".")
 
         # Check if the file exists and file_type
-        error = self._check_file_type_and_path(file_path, file_extension)
+        error = check_file_type_and_path(file_path, file_extension)
 
         if error:
             return error, None
 
-        error = self._check_model(model)
+        error = check_model(model)
         if error:
             return error, None
 
@@ -145,24 +132,24 @@ def extract(
         else:
             return f"Error: {response.status_code} {response.text}", None
 
-    def extract_json(
+    def extract_key_value(
         self,
         file_path: str,
         extract_instruction: Dict,
     ) -> Tuple[str, str]:
-        """Extract json in real-time.
+        """Extract key-value pairs from a file in real-time.
 
         Args:
             file_path (str): The path to the file to be parsed.
-            extract_instruction (Dict): A dictionary containing the keys to be extracted,
-                with their values as the description of those keys.
+            extract_instruction (Dict): A dictionary containing the keys to be
+                extracted, with their values as the description of those keys.
         Returns:
             tuple(str, str): The extracted data and the time taken.
         """
         file_extension = Path(file_path).suffix.lower().lstrip(".")
 
         # Check if the file exists and file_type
-        error = self._check_file_type_and_path(file_path, file_extension)
+        error = check_file_type_and_path(file_path, file_extension)
         if error:
             return error, None
 
@@ -274,23 +261,25 @@ def async_extract(
         model: ModelType = ModelType.BASE,
         extract_args: Optional[Dict] = None,
     ) -> str:
-        """Extract data asynchronously.
+        """Extract full content from a file asynchronously.
 
         Args:
             file_path (str): The path to the file to be parsed.
+            model (ModelType): The model to use for extraction. Can be
+                `ModelType.BASE` or `ModelType.PRO`. Defaults to `ModelType.BASE`.
             extract_args (Optional[Dict]): Additional extraction arguments added to prompt
         Returns:
             str: The file id of the uploaded file.
         """
         file_extension = Path(file_path).suffix.lower().lstrip(".")
 
         # Check if the file exists and file_type
-        error = self._check_file_type_and_path(file_path, file_extension)
+        error = check_file_type_and_path(file_path, file_extension)
 
         if error:
             return error, None
 
-        error = self._check_model(model)
+        error = check_model(model)
         if error:
             return error, None
 
@@ -321,26 +310,26 @@ def async_extract(
         )
 
         # If response successful, upload the file
-        return self._upload_file_to_presigned_url(file_path, response)
+        return upload_file_to_presigned_url(file_path, response)
 
-    def async_extract_json(
+    def async_extract_key_value(
         self,
         file_path: str,
         extract_instruction: Dict,
     ) -> str:
-        """Extract data asynchronously.
+        """Extract key-value pairs from a file asynchronously.
 
         Args:
             file_path (str): The path to the file to be parsed.
-            extract_instruction (Dict): A dictionary containing the keys to be extracted,
-                with their values as the description of those keys.
+            extract_instruction (Dict): A dictionary containing the keys to be
+                extracted, with their values as the description of those keys.
         Returns:
             str: The file id of the uploaded file.
         """
         file_extension = Path(file_path).suffix.lower().lstrip(".")
 
         # Check if the file exists and file_type
-        error = self._check_file_type_and_path(file_path, file_extension)
+        error = check_file_type_and_path(file_path, file_extension)
 
         if error:
             return error, None
@@ -363,15 +352,14 @@ def async_extract_json(
         )
 
         # If response successful, upload the file
-        return self._upload_file_to_presigned_url(file_path, response)
+        return upload_file_to_presigned_url(file_path, response)
 
     def async_fetch(
         self,
         file_id: str,
         sync: bool = True,
         sync_timeout: int = 60,
         sync_interval: int = 5,
-        result_type: str = "markdown",
     ) -> str:
         """Fetches extraction results asynchronously.
 
@@ -380,13 +368,11 @@ def async_fetch(
             sync (bool, optional): Whether to wait for the results synchronously.
             sync_timeout (int, optional): Maximum time to wait for results in seconds. Defaults to 60.
             sync_interval (int, optional): Time interval between polling attempts in seconds. Defaults to 5.
-            result_type (string, optional): The type of result to fetch. Defaults to `markdown`.
 
         Returns:
             str: The extracted results as a markdown string.
             None: If the extraction is still in progress (when sync is False).
         """
-        self._check_result_type(result_type)
 
         response = None
         # Create the JSON payload
@@ -416,58 +402,13 @@ def async_fetch(
         if response is None:
             return "Error: timeout, no response received"
         if response.status_code == 200:
-            if result_type == "json":
-                return response.json()["json"]
-            else:
-                markdown_list = response.json()["markdown"]
+            result = response.json()
+            if "json" in result:
+                return result["json"]
+            elif "markdown" in result:
+                markdown_list = result["markdown"]
                 return "\n".join(markdown_list)
+            return f"Error: Invalid response format\n {result}"
         if response.status_code == 202:
             return None
         return f"Error: {response.status_code} {response.text}"
-
-    def _upload_file_to_presigned_url(
-        self, file_path: str, response: requests.Response
-    ) -> str:
-        if response.status_code == 200:
-            try:
-                file_id = response.json().get("fileId")
-                presigned_url = response.json().get("presignedUrl")
-                with open(file_path, "rb") as file_to_upload:
-                    files = {"file": (file_path, file_to_upload)}
-                    upload_resp = requests.post(
-                        presigned_url["url"],
-                        data=presigned_url["fields"],
-                        files=files,
-                        timeout=TIMEOUT,
-                    )
-                    if upload_resp.status_code != 204:
-                        return f"Error: {upload_resp.status_code} {upload_resp.text}"
-                return file_id
-            except json.JSONDecodeError:
-                return "Error: Invalid JSON response"
-        else:
-            return f"Error: {response.status_code} {response.text}"
-
-    def _check_model(self, model: ModelType) -> None:
-        if model not in {ModelType.BASE, ModelType.PRO}:
-            valid_models = ", ".join(["`" + model.value + "`" for model in ModelType])
-            return f"Invalid model type: {model}. Supported `model` types include {valid_models}."
-
-    def _check_file_type_and_path(self, file_path, file_extension):
-        # Check if the file exists
-        if not Path(file_path).is_file():
-            return f"Error: File does not exist: {file_path}"
-
-        if file_extension not in SUPPORTED_FILE_EXTENSIONS:
-            supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
-            return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."
-
-    def _check_result_type(self, result_type: str) -> None:
-        if result_type not in RESULT_TYPES:
-            valid_result_types = ", ".join(RESULT_TYPES)
-            return f"Invalid result type: {result_type}. Supported `result_type` types include {valid_result_types}."
-
-    def _check_resume_extract_type(self, extract_type: str) -> None:
-        if extract_type not in RESUME_EXTRACT_TYPES:
-            valid_extract_types = ", ".join(RESUME_EXTRACT_TYPES)
-            return f"Invalid extract type: {extract_type}. Supported `extract_type` types include {valid_extract_types}."
diff --git a/any_parser/utils.py b/any_parser/utils.py
@@ -0,0 +1,63 @@
+import json
+from enum import Enum
+from pathlib import Path
+
+import requests
+
+
+class ModelType(Enum):
+    BASE = "base"
+    PRO = "pro"
+
+
+SUPPORTED_FILE_EXTENSIONS = [
+    "pdf",
+    "doc",
+    "docx",
+    "ppt",
+    "pptx",
+    "jpg",
+    "jpeg",
+    "png",
+    "gif",
+]
+
+
+def upload_file_to_presigned_url(
+    file_path: str, response: requests.Response, timeout: int = 10
+) -> str:
+    if response.status_code == 200:
+        try:
+            file_id = response.json().get("fileId")
+            presigned_url = response.json().get("presignedUrl")
+            with open(file_path, "rb") as file_to_upload:
+                files = {"file": (file_path, file_to_upload)}
+                upload_resp = requests.post(
+                    presigned_url["url"],
+                    data=presigned_url["fields"],
+                    files=files,
+                    timeout=timeout,
+                )
+                if upload_resp.status_code != 204:
+                    return f"Error: {upload_resp.status_code} {upload_resp.text}"
+            return file_id
+        except json.JSONDecodeError:
+            return "Error: Invalid JSON response"
+    else:
+        return f"Error: {response.status_code} {response.text}"
+
+
+def check_model(model: ModelType) -> None:
+    if model not in {ModelType.BASE, ModelType.PRO}:
+        valid_models = ", ".join(["`" + model.value + "`" for model in ModelType])
+        return f"Invalid model type: {model}. Supported `model` types include {valid_models}."
+
+
+def check_file_type_and_path(file_path, file_extension):
+    # Check if the file exists
+    if not Path(file_path).is_file():
+        return f"Error: File does not exist: {file_path}"
+
+    if file_extension not in SUPPORTED_FILE_EXTENSIONS:
+        supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
+        return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."
Original file line number	Diff line number	Diff line change
Expand Up		@@ -4,4 +4,4 @@

		__all__ = ["AnyParser", "ModelType"]

		-__version__ = "0.0.16"
		+__version__ = "0.0.17"