From f33917638d75ddf87eeb219fa0e5d7c58bb32326 Mon Sep 17 00:00:00 2001 From: SeisSerenata Date: Mon, 18 Nov 2024 13:13:39 +0000 Subject: [PATCH 1/5] feat: support file_content as input --- any_parser/any_parser.py | 504 +++++++++++++++---------------------- any_parser/async_parser.py | 80 ++++++ any_parser/base_parser.py | 59 +++++ any_parser/sync_parser.py | 51 ++++ any_parser/utils.py | 76 +++++- tests/test.py | 36 ++- 6 files changed, 476 insertions(+), 330 deletions(-) create mode 100644 any_parser/async_parser.py create mode 100644 any_parser/base_parser.py create mode 100644 any_parser/sync_parser.py diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index 0f6f8ba..e949508 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -3,28 +3,20 @@ import base64 import json import time -from enum import Enum from pathlib import Path -from typing import Dict, Optional, Tuple +from tempfile import NamedTemporaryFile import requests -from any_parser.utils import check_file_type_and_path, upload_file_to_presigned_url +from any_parser.async_parser import AsyncParser +from any_parser.base_parser import ProcessType +from any_parser.sync_parser import SyncParser +from any_parser.utils import validate_parser_inputs PUBLIC_SHARED_BASE_URL = "https://public-api.cambio-ai.com" TIMEOUT = 60 -class ProcessType(Enum): - EXTRACT_PII = "extract_pii" - EXTRACT_TABLES = "extract_tables" - EXTRACT_KEY_VALUE = "extract_key_value" - EXTRACT_RESUME_KEY_VALUE = "extract_resume_key_value" - PARSE = "parse" - PARSE_WITH_OCR = "parse_with_ocr" - PARSE_WITH_LAYOUT = "parse_with_layout" - - class AnyParser: """AnyParser RT: Real-time parser for any data format.""" @@ -38,186 +30,113 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None Returns: None """ - self._sync_parse_url = f"{base_url}/parse" - self._sync_extract_pii = f"{base_url}/extract_pii" - self._sync_extract_tables = f"{base_url}/extract_tables" - 
self._sync_extract_key_value = f"{base_url}/extract_key_value" - self._sync_extract_resume_key_value = f"{base_url}/extract_resume_key_value" - self._sync_parse_with_ocr = f"{base_url}/parse_with_ocr" - self._async_upload_url = f"{base_url}/async/upload" - self._async_fetch_url = f"{base_url}/async/fetch" - self._api_key = api_key - self._headers = { - "Content-Type": "application/json", - "x-api-key": self._api_key, - } - - def _get_sync_response( - self, url_endpoint: str, file_path: str, extract_args: Optional[Dict] = None - ) -> Tuple[Optional[requests.Response], str]: - """Extract full content from a file in real-time. - - Args: - url_endpoint (str): The URL endpoint for the API. - file_path (str): The path to the file to be parsed. - extract_args (Optional[Dict]): Additional extraction arguments. - - Returns: - tuple(requests.Response | None, str): The response object and the - time taken. If the file is invalid or the API request fails, - returns None and an error message. - """ - - file_extension = Path(file_path).suffix.lower().lstrip(".") - - # Check if the file exists and file_type - error = check_file_type_and_path(file_path, file_extension) - if error: - return None, error - - # Encode the file content in base64 - with open(file_path, "rb") as file: - encoded_file = base64.b64encode(file.read()).decode("utf-8") - - # Create the JSON payload - payload = { - "file_content": encoded_file, - "file_type": file_extension, - } - - if extract_args is not None and isinstance(extract_args, dict): - payload["extract_args"] = extract_args - - # Send the POST request - start_time = time.time() - response = requests.post( - url_endpoint, - headers=self._headers, - data=json.dumps(payload), - timeout=TIMEOUT, - ) - end_time = time.time() - - if response.status_code != 200: - return None, f"Error: {response.status_code} {response.text}" - - return response, f"{end_time - start_time:.2f} seconds" - - def _send_async_request( - self, - process_type: ProcessType, - 
file_path: str, - extract_args: Optional[Dict] = None, - ) -> str: - """Extract full content from a file asynchronously. - - Args: - url_endpoint (str): The URL endpoint for the API. - file_path (str): The path to the file to be parsed. - process_type (ProcessType): The type of processing to be done. - extract_args (Optional[Dict]): Additional extraction arguments. - - Returns: - str: The file id of the uploaded file. - """ - file_extension = Path(file_path).suffix.lower().lstrip(".") - - # Check if the file exists and file_type - error = check_file_type_and_path(file_path, file_extension) - if error: - return error - - file_name = Path(file_path).name + self._sync_parser = SyncParser(api_key, base_url) + self._async_parser = AsyncParser(api_key, base_url) + + @staticmethod + def handle_parsing(func): + """Decorator to handle common file processing logic.""" + + def wrapper( + self, + file_path=None, + file_content=None, + file_type=None, + *args, + **kwargs, + ): + # pylint: disable=too-many-arguments + # Validate inputs + is_valid, error_message = validate_parser_inputs( + file_path=file_path, + file_content=file_content, + file_type=file_type, + ) - # Create the JSON payload - payload = { - "file_name": file_name, - "process_type": process_type.value, - } - - if extract_args is not None and isinstance(extract_args, dict): - payload["extract_args"] = extract_args - - # Send the POST request - response = requests.post( - self._async_upload_url, - headers=self._headers, - data=json.dumps(payload), - timeout=TIMEOUT, - ) + if not is_valid: + return error_message, "" + + # Encode the file content in base64 if file_path is provided + if file_content is None: + assert file_path is not None # Type narrowing for mypy + try: + with open(file_path, "rb") as file: + file_content = base64.b64encode(file.read()).decode("utf-8") + file_type = Path(file_path).suffix.lower().lstrip(".") + except Exception as e: + return f"Error: {e}", "" + + return func( + self, + 
file_path=file_path, + file_content=file_content, + file_type=file_type, + *args, + **kwargs, + ) - # If response successful, upload the file - return upload_file_to_presigned_url(file_path, response) + return wrapper + @handle_parsing def parse( self, - file_path: str, - extract_args: Optional[Dict] = None, - ) -> Tuple[str, str]: - """Extract full content from a file in real-time. - - Args: - file_path (str): The path to the file to be parsed. - extract_args (Optional[Dict]): Additional extraction arguments added - to the prompt. - - Returns: - tuple(str, str): The extracted data and the time taken. - """ - - response, info = self._get_sync_response( - self._sync_parse_url, - file_path, + file_path, + file_content=None, + file_type=None, + extract_args=None, + ): + """Extract full content from a file in real-time.""" + response, info = self._sync_parser.get_sync_response( + self._sync_parser._sync_parse_url, + file_content=file_content, # type: ignore + file_type=file_type, # type: ignore extract_args=extract_args, ) + if response is None: return info, "" try: response_data = response.json() - response_list = [] - for text in response_data["markdown"]: - response_list.append(text) - markdown_text = "\n".join(response_list) - return ( - markdown_text, - f"Time Elapsed: {info}", - ) + result = "\n".join( + response_data["markdown"] + ) # Using direct extraction instead of extract_key + return result, f"Time Elapsed: {info}" except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" + @handle_parsing def extract_pii( self, - file_path: str, - ) -> Tuple[str, str]: - """Extract personally identifiable information (PII) from a file in real-time. - - Args: - file_path (str): The path to the file to be parsed. - Returns: - tuple(str, str): The extracted data and the time taken. 
- """ - response, info = self._get_sync_response( - self._sync_extract_pii, - file_path, + file_path=None, + file_content=None, + file_type=None, + ): + """Extract PII from a file in real-time.""" + response, info = self._sync_parser.get_sync_response( + self._sync_parser._sync_extract_pii, + file_content=file_content, # type: ignore + file_type=file_type, # type: ignore extract_args=None, ) + if response is None: return info, "" try: response_data = response.json() - return ( - response_data["pii_extraction"], - f"Time Elapsed: {info}", - ) + result = response_data["pii_extraction"] + return result, f"Time Elapsed: {info}" except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" + @handle_parsing def extract_tables( self, - file_path: str, - ) -> Tuple[str, str]: + file_path=None, + file_content=None, + file_type=None, + ): """Extract tables from a file in real-time. Args: @@ -225,32 +144,31 @@ def extract_tables( Returns: tuple(str, str): The extracted data and the time taken. 
""" - response, info = self._get_sync_response( - self._sync_extract_tables, - file_path, + response, info = self._sync_parser.get_sync_response( + self._sync_parser._sync_extract_tables, + file_content=file_content, # type: ignore + file_type=file_type, # type: ignore extract_args=None, ) + if response is None: return info, "" try: response_data = response.json() - response_list = [] - for text in response_data["markdown"]: - response_list.append(text) - markdown_text = "\n".join(response_list) - return ( - markdown_text, - f"Time Elapsed: {info}", - ) + result = "\n".join(response_data["markdown"]) + return result, f"Time Elapsed: {info}" except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" + @handle_parsing def extract_key_value( self, - file_path: str, - extract_instruction: Dict, - ) -> Tuple[str, str]: + file_path=None, + file_content=None, + file_type=None, + extract_instruction=None, + ): """Extract key-value pairs from a file in real-time. Args: @@ -260,28 +178,27 @@ def extract_key_value( Returns: tuple(str, str): The extracted data and the time taken. """ - response, info = self._get_sync_response( - self._sync_extract_key_value, - file_path, + response, info = self._sync_parser.get_sync_response( + self._sync_parser._sync_extract_key_value, + file_content=file_content, # type: ignore + file_type=file_type, # type: ignore extract_args={"extract_instruction": extract_instruction}, ) + if response is None: return info, "" try: response_data = response.json() result = response_data["json"] - return ( - result, - f"Time Elapsed: {info}", - ) + return result, f"Time Elapsed: {info}" except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" + @handle_parsing def extract_resume_key_value( - self, - file_path: str, - ) -> Tuple[str, str]: + self, file_path=None, file_content=None, file_type=None + ): """Extract resume in real-time. 
Args: @@ -295,144 +212,114 @@ def extract_resume_key_value( - "skills": Skills - "certifications": Certifications - "projects": Projects - - "pii": Personally Identifiable Information - includes only name, email, and phone + - "pii": Personally Identifiable Information - includes + only name, email, and phone """ - response, info = self._get_sync_response( - self._sync_extract_resume_key_value, - file_path, + response, info = self._sync_parser.get_sync_response( + self._sync_parser._sync_extract_resume_key_value, + file_content=file_content, # type: ignore + file_type=file_type, # type: ignore extract_args=None, ) + if response is None: return info, "" try: response_data = response.json() result = response_data["extraction_result"] - return ( - result, - f"Time Elapsed: {info}", - ) + return result, f"Time Elapsed: {info}" except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" - def async_parse( - self, - file_path: str, - extract_args: Optional[Dict] = None, - ) -> str: - """Extract full content from a file asynchronously. - - Args: - file_path (str): The path to the file to be parsed. - extract_args (Optional[Dict]): Additional extraction arguments added to prompt - Returns: - str: The file id of the uploaded file. 
- """ - return self._send_async_request( - ProcessType.PARSE, - file_path, - extract_args=extract_args, + @staticmethod + def handle_async_parsing(func): + """Decorator to handle common async file processing logic.""" + + def wrapper( + self, + file_path=None, + file_content=None, + file_type=None, + *args, + **kwargs, + ): + # Validate inputs + is_valid, error_message = validate_parser_inputs( + file_path=file_path, + file_content=file_content, + file_type=file_type, + ) + print("is_valid", is_valid) + + if not is_valid: + return error_message + + # Dump the file content into a NamedTemporaryFile if file_path + # is not provided + if file_path: + file_type = Path(file_path).suffix.lower().lstrip(".") + else: + file_path = NamedTemporaryFile(delete=False, suffix=file_type).name + with open(file_path, "wb") as file: + file.write(base64.b64decode(file_content)) # type: ignore + + # Call the actual function with processed arguments + return func(self, file_path=file_path, *args, **kwargs) + + return wrapper + + # Example of decorated methods: + @handle_async_parsing + def async_parse(self, file_path, extract_args=None): + """Extract full content from a file asynchronously.""" + return self._async_parser.send_async_request( + ProcessType.PARSE, file_path, extract_args ) - def async_parse_with_layout(self, file_path: str) -> str: - """Extract full content from a file asynchronously. - - Compared with `async_extract`, this method will first analyze the layout of the file. - Then it will process text, tables, and images separately; - and return the combined result in markdown format. - - Args: - file_path (str): The path to the file to be parsed. - Returns: - str: The file id of the uploaded file. 
- """ - return self._send_async_request( - ProcessType.PARSE_WITH_LAYOUT, - file_path, - extract_args=None, + @handle_async_parsing + def async_parse_with_layout(self, file_path): + """Extract content from a file asynchronously with layout analysis.""" + return self._async_parser.send_async_request( + ProcessType.PARSE_WITH_LAYOUT, file_path ) - def async_parse_with_ocr(self, file_path: str) -> str: - """Extract full content from a file asynchronously. - - Compared with `async_extract`, this method will first perform OCR on the file. - Then it will process text, tables, and images separately; - and return the combined result in markdown format. - - Args: - file_path (str): The path to the file to be parsed. - Returns: - str: The file id of the uploaded file. - """ - return self._send_async_request( - ProcessType.PARSE_WITH_OCR, - file_path, - extract_args=None, + @handle_async_parsing + def async_parse_with_ocr(self, file_path): + """Extract full content from a file asynchronously with OCR.""" + return self._async_parser.send_async_request( + ProcessType.PARSE_WITH_OCR, file_path ) - def async_extract_pii(self, file_path: str) -> str: - """Extract personally identifiable information (PII) from a file asynchronously. - - Args: - file_path (str): The path to the file to be parsed. - Returns: - str: The file id of the uploaded file. - """ - return self._send_async_request( - ProcessType.EXTRACT_PII, - file_path, - extract_args=None, + @handle_async_parsing + def async_extract_pii(self, file_path, extract_args=None): + """Extract PII from a file asynchronously.""" + return self._async_parser.send_async_request( + ProcessType.EXTRACT_PII, file_path, extract_args ) - def async_extract_tables(self, file_path: str) -> str: - """Extract tables from a file asynchronously. - - Args: - file_path (str): The path to the file to be parsed. - Returns: - str: The file id of the uploaded file. 
- """ - return self._send_async_request( - ProcessType.EXTRACT_TABLES, - file_path, - extract_args=None, + @handle_async_parsing + def async_extract_tables(self, file_path): + """Extract tables from a file asynchronously.""" + return self._async_parser.send_async_request( + ProcessType.EXTRACT_TABLES, file_path ) - def async_extract_key_value( - self, - file_path: str, - extract_instruction: Dict, - ) -> str: - """Extract key-value pairs from a file asynchronously. - - Args: - file_path (str): The path to the file to be parsed. - extract_instruction (Dict): A dictionary containing the keys to be - extracted, with their values as the description of those keys. - Returns: - str: The file id of the uploaded file. - """ - return self._send_async_request( + @handle_async_parsing + def async_extract_key_value(self, file_path, extract_instruction=None): + """Extract key-value pairs from a file asynchronously.""" + print("reached async extract key value") + return self._async_parser.send_async_request( ProcessType.EXTRACT_KEY_VALUE, file_path, extract_args={"extract_instruction": extract_instruction}, ) - def async_extract_resume_key_value( - self, - file_path: str, - ) -> str: - """Extract key-value pairs from a file asynchronously. - - Args: - file_path (str): The path to the file to be parsed. - Returns: - str: The file id of the uploaded file. - """ - return self._send_async_request( - ProcessType.EXTRACT_RESUME_KEY_VALUE, - file_path, - extract_args=None, + @handle_async_parsing + def async_extract_resume_key_value(self, file_path): + """Extract resume key-value pairs from a file asynchronously.""" + return self._async_parser.send_async_request( + ProcessType.EXTRACT_RESUME_KEY_VALUE, file_path, extract_args=None ) def async_fetch( @@ -446,9 +333,12 @@ def async_fetch( Args: file_id (str): The ID of the file to fetch results for. - sync (bool, optional): Whether to wait for the results synchronously. 
- sync_timeout (int, optional): Maximum time to wait for results in seconds. Defaults to 60. - sync_interval (int, optional): Time interval between polling attempts in seconds. Defaults to 5. + sync (bool, optional): Whether to wait for the results + synchronously. + sync_timeout (int, optional): Maximum time to wait for results in + seconds. Defaults to 60. + sync_interval (int, optional): Time interval between polling + attempts in seconds. Defaults to 5. Returns: str: The extracted results as a markdown string. @@ -462,8 +352,8 @@ def async_fetch( start_time = time.time() while time.time() < start_time + sync_timeout: response = requests.post( - self._async_fetch_url, - headers=self._headers, + self._async_parser._async_fetch_url, + headers=self._async_parser._headers, data=json.dumps(payload), timeout=TIMEOUT, ) @@ -474,8 +364,8 @@ def async_fetch( break else: response = requests.post( - self._async_fetch_url, - headers=self._headers, + self._async_parser._async_fetch_url, + headers=self._async_parser._headers, data=json.dumps(payload), timeout=TIMEOUT, ) @@ -495,5 +385,5 @@ def async_fetch( return "\n".join(markdown_list) return f"Error: Invalid response format\n {result}" if response.status_code == 202: - return None + return "" return f"Error: {response.status_code} {response.text}" diff --git a/any_parser/async_parser.py b/any_parser/async_parser.py new file mode 100644 index 0000000..a222ae0 --- /dev/null +++ b/any_parser/async_parser.py @@ -0,0 +1,80 @@ +"""Asynchronous parser implementation.""" + +import base64 +import json +from pathlib import Path +from tempfile import NamedTemporaryFile +from typing import Dict, Optional + +import requests + +from any_parser.base_parser import BaseParser, ProcessType +from any_parser.utils import upload_file_to_presigned_url + +TIMEOUT = 60 + + +class AsyncParser(BaseParser): + def _setup_endpoints(self) -> None: + self._async_upload_url = f"{self._base_url}/async/upload" + self._async_fetch_url = 
f"{self._base_url}/async/fetch" + + def _async_process_file( + self, + file_path: Optional[str], + file_content: Optional[str], + file_type: Optional[str], + ): + file_path, file_type, error = self._process_file( + file_path, file_content, file_type + ) + if error: + return None, None, error + + if not file_path: + temp_file = NamedTemporaryFile(delete=False, suffix=file_type) + file_path = temp_file.name + with open(file_path, "wb") as file: + file.write(base64.b64decode(file_content)) # type: ignore + + return file_path, file_type, None + + def send_async_request( + self, + process_type: ProcessType, + file_path: str, + extract_args: Optional[Dict] = None, + ) -> str: + """Extract full content from a file asynchronously. + + Args: + url_endpoint (str): The URL endpoint for the API. + file_path (str): The path to the file to be parsed. + process_type (ProcessType): The type of processing to be done. + extract_args (Optional[Dict]): Additional extraction arguments. + + Returns: + str: The file id of the uploaded file. 
+ """ + + file_name = Path(file_path).name + + # Create the JSON payload + payload = { + "file_name": file_name, + "process_type": process_type.value, + } + + if extract_args is not None and isinstance(extract_args, dict): + payload["extract_args"] = extract_args # type: ignore + + # Send the POST request + response = requests.post( + self._async_upload_url, + headers=self._headers, + data=json.dumps(payload), + timeout=TIMEOUT, + ) + + # If response successful, upload the file + return upload_file_to_presigned_url(file_path, response) diff --git a/any_parser/base_parser.py b/any_parser/base_parser.py new file mode 100644 index 0000000..aa3f9d8 --- /dev/null +++ b/any_parser/base_parser.py @@ -0,0 +1,59 @@ +"""Base parser implementation.""" + +import base64 +from enum import Enum +from pathlib import Path +from typing import Optional, Tuple + +from any_parser.utils import validate_parser_inputs + + +class ProcessType(Enum): + EXTRACT_PII = "extract_pii" + EXTRACT_TABLES = "extract_tables" + EXTRACT_KEY_VALUE = "extract_key_value" + EXTRACT_RESUME_KEY_VALUE = "extract_resume_key_value" + PARSE = "parse" + PARSE_WITH_OCR = "parse_with_ocr" + PARSE_WITH_LAYOUT = "parse_with_layout" + + +class BaseParser: + def __init__(self, api_key: str, base_url: str) -> None: + self._api_key = api_key + self._base_url = base_url + self._headers = { + "Content-Type": "application/json", + "x-api-key": self._api_key, + } + self._setup_endpoints() + + def _setup_endpoints(self) -> None: + """Setup API endpoints - to be implemented by child classes.""" + raise NotImplementedError + + def _process_file( + self, + file_path: Optional[str], + file_content: Optional[str], + file_type: Optional[str], + ) -> Tuple[Optional[str], Optional[str], Optional[str]]: + """Common file processing logic.""" + # Validate inputs + is_valid, error_message = validate_parser_inputs( + file_path=file_path, file_content=file_content, file_type=file_type + ) + + if not is_valid: + return None, None, 
error_message + + if file_content is None and file_path is not None: + try: + with open(file_path, "rb") as file: + file_read = file.read() + file_content = base64.b64encode(file_read).decode("utf-8") + file_type = Path(file_path).suffix.lower().lstrip(".") + except Exception as e: + return None, None, f"Error: {e}" + + return file_content, file_type, None diff --git a/any_parser/sync_parser.py b/any_parser/sync_parser.py new file mode 100644 index 0000000..012d8e8 --- /dev/null +++ b/any_parser/sync_parser.py @@ -0,0 +1,51 @@ +"""Synchronous parser implementation.""" + +import json +import time +from typing import Any, Dict, Optional, Tuple + +import requests + +from any_parser.base_parser import BaseParser + +TIMEOUT = 60 + + +class SyncParser(BaseParser): + def _setup_endpoints(self) -> None: + self._sync_parse_url = f"{self._base_url}/parse" + self._sync_extract_pii = f"{self._base_url}/extract_pii" + self._sync_extract_tables = f"{self._base_url}/extract_tables" + self._sync_extract_key_value = f"{self._base_url}/extract_key_value" + self._sync_extract_resume_key_value = ( + f"{self._base_url}/extract_resume_key_value" + ) + self._sync_parse_with_ocr = f"{self._base_url}/parse_with_ocr" + + def get_sync_response( + self, + url_endpoint: str, + file_content: str, + file_type: str, + extract_args: Optional[Dict[str, Any]] = None, + ) -> Tuple[Optional[requests.Response], str]: + payload = { + "file_content": file_content, + "file_type": file_type, + } + if extract_args: + payload["extract_args"] = extract_args # type: ignore + + start_time = time.time() + response = requests.post( + url_endpoint, + headers=self._headers, + data=json.dumps(payload), + timeout=TIMEOUT, + ) + end_time = time.time() + + if response.status_code != 200: + return None, f"Error: {response.status_code} {response.text}" + + return response, f"{end_time - start_time:.2f} seconds" diff --git a/any_parser/utils.py b/any_parser/utils.py index 7795865..b98d6fd 100644 --- a/any_parser/utils.py 
+++ b/any_parser/utils.py @@ -1,5 +1,7 @@ import json +from enum import Enum from pathlib import Path +from typing import Optional, Tuple import requests @@ -16,6 +18,70 @@ ] +class ValidationError(Enum): + MISSING_INPUTS = "Either file_content or file_path must be provided" + MISSING_FILE_TYPE = "file_type must be provided when using file_content" + NOT_FOUND = "File does not exist: {}" + UNSUPPORTED_FILE_TYPE = "Unsupported file type: {}. Supported file types: {}" + FILE_EMPTY = "File is empty: {}" + FILE_TOO_LARGE = "File size exceeds maximum limit of {} MB: {}" + OTHER = "{}" + + +def validate_parser_inputs( + file_path: Optional[str], + file_content: Optional[str], + file_type: Optional[str], +) -> Tuple[bool, str]: + """Validate inputs for the parser. + + Args: + file_content (Optional[str]): Base64 encoded file content + file_path (Optional[str]): Path to the file + file_type (Optional[str]): File extension/type + + Returns: + Tuple[bool, str]: (is_valid, error_message) + - is_valid: True if validation passes, False otherwise + - error_message: "" if validation passes, error if validation fails + """ + print("file_content", file_content) + print("file_type", file_type) + print("file_path", file_path) + # Check if at least one input method is provided + if file_content is None and file_path is None: + return False, ValidationError.MISSING_INPUTS.value + + # Validate file_content path + if file_content is not None and file_type is None: + return False, ValidationError.MISSING_FILE_TYPE.value + + # Validate file path if provided + if file_path is not None: + path = Path(file_path) + + # Check if file exists + if not path.is_file(): + return False, ValidationError.NOT_FOUND.value.format(file_path) + + # Check if file is empty + if path.stat().st_size == 0: + return False, ValidationError.FILE_EMPTY.value.format(file_path) + + # If file_type not provided, extract it from file_path + if file_type is None: + file_type = path.suffix.lower().lstrip(".") + + # 
Validate file type + if file_type not in SUPPORTED_FILE_EXTENSIONS: + supported_types = ", ".join(sorted(SUPPORTED_FILE_EXTENSIONS)) + return False, ValidationError.UNSUPPORTED_FILE_TYPE.value.format( + file_type, supported_types + ) + + return True, "" + + def upload_file_to_presigned_url( file_path: str, response: requests.Response, timeout: int = 10 ) -> str: @@ -38,13 +104,3 @@ def upload_file_to_presigned_url( return "Error: Invalid JSON response" else: return f"Error: {response.status_code} {response.text}" - - -def check_file_type_and_path(file_path, file_extension): - # Check if the file exists - if not Path(file_path).is_file(): - return f"Error: File does not exist: {file_path}" - - if file_extension not in SUPPORTED_FILE_EXTENSIONS: - supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS) - return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}." diff --git a/tests/test.py b/tests/test.py index 33b89ea..3bde42f 100755 --- a/tests/test.py +++ b/tests/test.py @@ -23,7 +23,10 @@ def get_ground_truth(file_path: str) -> str: def compare_markdown(generated_output: str, correct_output: str) -> float: - """Compare the generated markdown to the correct markdown using Levenshtein Distance.""" + """ + Compare the generated markdown to the correct markdown using + Levenshtein Distance. 
+ """ distance = Levenshtein.distance(generated_output, correct_output) max_len = max(len(generated_output), len(correct_output)) @@ -47,7 +50,8 @@ def test_pdf_sync_parse(self): correct_output_file = "./tests/outputs/correct_pdf_output.txt" # extract - markdown, elapsed_time = self.ap.parse(working_file) + markdown, elapsed_time = self.ap.parse(file_path=working_file) + self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -63,7 +67,7 @@ def test_pdf_async_parse_and_fetch(self): correct_output_file = "./tests/outputs/correct_pdf_output.txt" # extract - file_id = self.ap.async_parse(working_file) + file_id = self.ap.async_parse(file_path=working_file) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch markdown = self.ap.async_fetch(file_id=file_id) @@ -81,7 +85,7 @@ def test_docx_sync_extract(self): correct_output_file = "./tests/outputs/correct_docx_output.txt" # extract - markdown, elapsed_time = self.ap.parse(working_file) + markdown, elapsed_time = self.ap.parse(file_path=working_file) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -97,7 +101,7 @@ def test_docx_async_parse_and_fetch(self): correct_output_file = "./tests/outputs/correct_docx_output.txt" # extract - file_id = self.ap.async_parse(working_file) + file_id = self.ap.async_parse(file_path=working_file) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch markdown = self.ap.async_fetch(file_id=file_id) @@ -115,7 +119,7 @@ def test_pptx_sync_extract(self): correct_output_file = "./tests/outputs/correct_pptx_output.txt" # extract - markdown, elapsed_time = self.ap.parse(working_file) + markdown, elapsed_time = self.ap.parse(file_path=working_file) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = 
get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -131,7 +135,7 @@ def test_pptx_async_parse_and_fetch(self): correct_output_file = "./tests/outputs/correct_pptx_output.txt" # extract - file_id = self.ap.async_parse(working_file) + file_id = self.ap.async_parse(file_path=working_file) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch markdown = self.ap.async_fetch(file_id=file_id) @@ -149,7 +153,7 @@ def test_image_sync_extract(self): correct_output_file = "./tests/outputs/correct_png_output.txt" # extract - markdown, elapsed_time = self.ap.parse(working_file) + markdown, elapsed_time = self.ap.parse(file_path=working_file) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) percentage = compare_markdown(markdown, correct_output) @@ -165,7 +169,7 @@ def test_image_async_parse_and_fetch(self): correct_output_file = "./tests/outputs/correct_png_output.txt" # extract - file_id = self.ap.async_parse(working_file) + file_id = self.ap.async_parse(file_path=working_file) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch markdown = self.ap.async_fetch(file_id=file_id) @@ -178,12 +182,15 @@ def test_image_async_parse_and_fetch(self): ) def test_sync_extract_key_value(self): - """Synchronous JSON Extraction with subtests for different file formats""" + """ + Synchronous JSON Extraction with subtests for different file formats + """ for data in EXTRACT_JSON_TEST_DATA: with self.subTest(working_file=data["working_file"]): # extract key_value_result, elapsed_time = self.ap.extract_key_value( - data["working_file"], data["extract_instruction"] + file_path=data["working_file"], + extract_instruction=data["extract_instruction"], ) # assertions @@ -191,12 +198,15 @@ def test_sync_extract_key_value(self): self.assertIn("Time Elapsed", elapsed_time) def test_async_extract_key_value_and_fetch(self): - """Asynchronous JSON Extraction with subtests for 
different file formats""" + """ + Asynchronous JSON Extraction with subtests for different file formats + """ for data in EXTRACT_JSON_TEST_DATA: with self.subTest(working_file=data["working_file"]): # extract file_id = self.ap.async_extract_key_value( - data["working_file"], data["extract_instruction"] + file_path=data["working_file"], + extract_instruction=data["extract_instruction"], ) self.assertFalse(file_id.startswith("Error:"), file_id) # fetch From 5654cac490ef38321565e78add8bc76ad60f4ca8 Mon Sep 17 00:00:00 2001 From: SeisSerenata Date: Mon, 18 Nov 2024 13:43:55 +0000 Subject: [PATCH 2/5] feat: support file_content as input --- any_parser/any_parser.py | 2 +- tests/test.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index e949508..fba83e4 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -80,7 +80,7 @@ def wrapper( @handle_parsing def parse( self, - file_path, + file_path=None, file_content=None, file_type=None, extract_args=None, diff --git a/tests/test.py b/tests/test.py index 3bde42f..e4afe72 100755 --- a/tests/test.py +++ b/tests/test.py @@ -1,9 +1,11 @@ """Testing Synchronous and Asynchronous Extraction""" +import base64 import os import sys import time import unittest +from pathlib import Path import Levenshtein from dotenv import load_dotenv @@ -61,6 +63,29 @@ def test_pdf_sync_parse(self): ) self.assertIn("Time Elapsed", elapsed_time) + def test_pdf_sync_parse_with_file_content(self): + """Synchronous PDF Parse with file content""" + working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" + correct_output_file = "./tests/outputs/correct_pdf_output.txt" + + with open(working_file, "rb") as file: + file_content = base64.b64encode(file.read()).decode("utf-8") + file_type = Path(working_file).suffix.lower().lstrip(".") + + # extract + markdown, elapsed_time = self.ap.parse( + file_content=file_content, 
file_type=file_type + ) # pylint: disable=too-many-arguments + + self.assertFalse(markdown.startswith("Error:"), markdown) + correct_output = get_ground_truth(correct_output_file) + percentage = compare_markdown(markdown, correct_output) + + self.assertGreaterEqual( + percentage, 90, f"Output similarity too low: {percentage:.2f}%" + ) + self.assertIn("Time Elapsed", elapsed_time) + def test_pdf_async_parse_and_fetch(self): """Asynchronous PDF Parse and Fetch""" working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" From 1feec685fb0fcd85762744f7b613023b1f3c78ee Mon Sep 17 00:00:00 2001 From: SeisSerenata Date: Mon, 18 Nov 2024 15:13:02 +0000 Subject: [PATCH 3/5] test: update test cases --- any_parser/any_parser.py | 68 ++++++++++++++++++++++++++++---------- any_parser/async_parser.py | 22 ------------ any_parser/utils.py | 3 -- tests/test.py | 24 +++++++++++++- 4 files changed, 73 insertions(+), 44 deletions(-) diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index fba83e4..ffcb226 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -250,7 +250,6 @@ def wrapper( file_content=file_content, file_type=file_type, ) - print("is_valid", is_valid) if not is_valid: return error_message @@ -260,7 +259,10 @@ def wrapper( if file_path: file_type = Path(file_path).suffix.lower().lstrip(".") else: - file_path = NamedTemporaryFile(delete=False, suffix=file_type).name + file_path = NamedTemporaryFile( + delete=False, suffix=f".{file_type}" + ).name + print(file_path) with open(file_path, "wb") as file: file.write(base64.b64decode(file_content)) # type: ignore @@ -271,55 +273,85 @@ def wrapper( # Example of decorated methods: @handle_async_parsing - def async_parse(self, file_path, extract_args=None): + def async_parse( + self, + file_path=None, + file_content=None, + file_type=None, + extract_args=None, + ): """Extract full content from a file asynchronously.""" return self._async_parser.send_async_request( - ProcessType.PARSE, 
file_path, extract_args + process_type=ProcessType.PARSE, + file_path=file_path, # type: ignore + extract_args=extract_args, ) @handle_async_parsing - def async_parse_with_layout(self, file_path): + def async_parse_with_layout( + self, file_path=None, file_content=None, file_type=None + ): """Extract content from a file asynchronously with layout analysis.""" return self._async_parser.send_async_request( - ProcessType.PARSE_WITH_LAYOUT, file_path + process_type=ProcessType.PARSE_WITH_LAYOUT, + file_path=file_path, # type: ignore ) @handle_async_parsing - def async_parse_with_ocr(self, file_path): + def async_parse_with_ocr(self, file_path=None, file_content=None, file_type=None): """Extract full content from a file asynchronously with OCR.""" return self._async_parser.send_async_request( - ProcessType.PARSE_WITH_OCR, file_path + process_type=ProcessType.PARSE_WITH_OCR, + file_path=file_path, # type: ignore ) @handle_async_parsing - def async_extract_pii(self, file_path, extract_args=None): + def async_extract_pii( + self, + file_path=None, + file_content=None, + file_type=None, + extract_args=None, + ): """Extract PII from a file asynchronously.""" return self._async_parser.send_async_request( - ProcessType.EXTRACT_PII, file_path, extract_args + process_type=ProcessType.EXTRACT_PII, + file_path=file_path, # type: ignore + extract_args=extract_args, ) @handle_async_parsing - def async_extract_tables(self, file_path): + def async_extract_tables(self, file_path=None, file_content=None, file_type=None): """Extract tables from a file asynchronously.""" return self._async_parser.send_async_request( - ProcessType.EXTRACT_TABLES, file_path + process_type=ProcessType.EXTRACT_TABLES, + file_path=file_path, # type: ignore ) @handle_async_parsing - def async_extract_key_value(self, file_path, extract_instruction=None): + def async_extract_key_value( + self, + file_path=None, + file_content=None, + file_type=None, + extract_instruction=None, + ): """Extract key-value pairs from 
a file asynchronously.""" - print("reached async extract key value") return self._async_parser.send_async_request( - ProcessType.EXTRACT_KEY_VALUE, - file_path, + process_type=ProcessType.EXTRACT_KEY_VALUE, + file_path=file_path, # type: ignore extract_args={"extract_instruction": extract_instruction}, ) @handle_async_parsing - def async_extract_resume_key_value(self, file_path): + def async_extract_resume_key_value( + self, file_path=None, file_content=None, file_type=None + ): """Extract resume key-value pairs from a file asynchronously.""" return self._async_parser.send_async_request( - ProcessType.EXTRACT_RESUME_KEY_VALUE, file_path, extract_args=None + process_type=ProcessType.EXTRACT_RESUME_KEY_VALUE, + file_path=file_path, # type: ignore + extract_args=None, ) def async_fetch( diff --git a/any_parser/async_parser.py b/any_parser/async_parser.py index a222ae0..4fb8cd6 100644 --- a/any_parser/async_parser.py +++ b/any_parser/async_parser.py @@ -1,9 +1,7 @@ """Asynchronous parser implementation.""" -import base64 import json from pathlib import Path -from tempfile import NamedTemporaryFile from typing import Dict, Optional import requests @@ -19,26 +17,6 @@ def _setup_endpoints(self) -> None: self._async_upload_url = f"{self._base_url}/async/upload" self._async_fetch_url = f"{self._base_url}/async/fetch" - def _async_process_file( - self, - file_path: Optional[str], - file_content: Optional[str], - file_type: Optional[str], - ): - file_path, file_type, error = self._process_file( - file_path, file_content, file_type - ) - if error: - return None, None, error - - if not file_path: - temp_file = NamedTemporaryFile(delete=False, suffix=file_type) - file_path = temp_file.name - with open(file_path, "wb") as file: - file.write(base64.b64decode(file_content)) # type: ignore - - return file_path, file_type, None - def send_async_request( self, process_type: ProcessType, diff --git a/any_parser/utils.py b/any_parser/utils.py index b98d6fd..b63a4e6 100644 --- 
a/any_parser/utils.py +++ b/any_parser/utils.py @@ -45,9 +45,6 @@ def validate_parser_inputs( - is_valid: True if validation passes, False otherwise - error_message: "" if validation passes, error if validation fails """ - print("file_content", file_content) - print("file_type", file_type) - print("file_path", file_path) # Check if at least one input method is provided if file_content is None and file_path is None: return False, ValidationError.MISSING_INPUTS.value diff --git a/tests/test.py b/tests/test.py index e4afe72..f1992bc 100755 --- a/tests/test.py +++ b/tests/test.py @@ -75,7 +75,7 @@ def test_pdf_sync_parse_with_file_content(self): # extract markdown, elapsed_time = self.ap.parse( file_content=file_content, file_type=file_type - ) # pylint: disable=too-many-arguments + ) self.assertFalse(markdown.startswith("Error:"), markdown) correct_output = get_ground_truth(correct_output_file) @@ -104,6 +104,28 @@ def test_pdf_async_parse_and_fetch(self): percentage, 90, f"Output similarity too low: {percentage:.2f}%" ) + def test_pdf_async_parse_and_fetch_with_file_content(self): + """Asynchronous PDF Parse and Fetch with file content""" + working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" + correct_output_file = "./tests/outputs/correct_pdf_output.txt" + + with open(working_file, "rb") as file: + file_content = base64.b64encode(file.read()).decode("utf-8") + file_type = Path(working_file).suffix.lower().lstrip(".") + + # extract + file_id = self.ap.async_parse(file_content=file_content, file_type=file_type) + self.assertFalse(file_id.startswith("Error:"), file_id) + # fetch + markdown = self.ap.async_fetch(file_id=file_id) + self.assertFalse(markdown.startswith("Error:"), markdown) + correct_output = get_ground_truth(correct_output_file) + percentage = compare_markdown(markdown, correct_output) + + self.assertGreaterEqual( + percentage, 90, f"Output similarity too low: {percentage:.2f}%" + ) + def test_docx_sync_extract(self): """Synchronous Word 
Extraction""" working_file = "./examples/sample_data/test_odf.docx" From e9d302c44fcad4ad52bff3e7c2dbd3c907d994f7 Mon Sep 17 00:00:00 2001 From: SeisSerenata Date: Mon, 18 Nov 2024 15:38:48 +0000 Subject: [PATCH 4/5] fix: fix python 3.9 build issue --- any_parser/any_parser.py | 164 +++++++++++++++++++-------------------- 1 file changed, 81 insertions(+), 83 deletions(-) diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index ffcb226..6771cd7 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -17,6 +17,87 @@ TIMEOUT = 60 +def handle_parsing(func): + """Decorator to handle common file processing logic.""" + + def wrapper( + self, + file_path=None, + file_content=None, + file_type=None, + *args, + **kwargs, + ): + # pylint: disable=too-many-arguments + # Validate inputs + is_valid, error_message = validate_parser_inputs( + file_path=file_path, + file_content=file_content, + file_type=file_type, + ) + + if not is_valid: + return error_message, "" + + # Encode the file content in base64 if file_path is provided + if file_content is None: + assert file_path is not None # Type narrowing for mypy + try: + with open(file_path, "rb") as file: + file_content = base64.b64encode(file.read()).decode("utf-8") + file_type = Path(file_path).suffix.lower().lstrip(".") + except Exception as e: + return f"Error: {e}", "" + + return func( + self, + file_path=file_path, + file_content=file_content, + file_type=file_type, + *args, + **kwargs, + ) + + return wrapper + + +def handle_async_parsing(func): + """Decorator to handle common async file processing logic.""" + + def wrapper( + self, + file_path=None, + file_content=None, + file_type=None, + *args, + **kwargs, + ): + # Validate inputs + is_valid, error_message = validate_parser_inputs( + file_path=file_path, + file_content=file_content, + file_type=file_type, + ) + + if not is_valid: + return error_message + + # Dump the file content into a NamedTemporaryFile if file_path + # is not provided + if 
file_path: + file_type = Path(file_path).suffix.lower().lstrip(".") + else: + file_path = NamedTemporaryFile(delete=False, suffix=f".{file_type}").name + print(file_path) + with open(file_path, "wb") as file: + file.write(base64.b64decode(file_content)) # type: ignore + + # Call the actual function with processed arguments + return func(self, file_path=file_path, *args, **kwargs) + + return wrapper + + class AnyParser: """AnyParser RT: Real-time parser for any data format.""" @@ -33,50 +114,6 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None self._sync_parser = SyncParser(api_key, base_url) self._async_parser = AsyncParser(api_key, base_url) - @staticmethod - def handle_parsing(func): - """Decorator to handle common file processing logic.""" - - def wrapper( - self, - file_path=None, - file_content=None, - file_type=None, - *args, - **kwargs, - ): - # pylint: disable=too-many-arguments - # Validate inputs - is_valid, error_message = validate_parser_inputs( - file_path=file_path, - file_content=file_content, - file_type=file_type, - ) - - if not is_valid: - return error_message, "" - - # Encode the file content in base64 if file_path is provided - if file_content is None: - assert file_path is not None # Type narrowing for mypy - try: - with open(file_path, "rb") as file: - file_content = base64.b64encode(file.read()).decode("utf-8") - file_type = Path(file_path).suffix.lower().lstrip(".") - except Exception as e: - return f"Error: {e}", "" - - return func( - self, - file_path=file_path, - file_content=file_content, - file_type=file_type, - *args, - **kwargs, - ) - - return wrapper - @handle_parsing def parse( self, @@ -232,45 +269,6 @@ def extract_resume_key_value( except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" - @staticmethod - def handle_async_parsing(func): - """Decorator to handle common async file processing logic.""" - - def wrapper( - self, - file_path=None, - file_content=None, - 
file_type=None, - *args, - **kwargs, - ): - # Validate inputs - is_valid, error_message = validate_parser_inputs( - file_path=file_path, - file_content=file_content, - file_type=file_type, - ) - - if not is_valid: - return error_message - - # Dump the file content into a NamedTemporaryFile if file_path - # is not provided - if file_path: - file_type = Path(file_path).suffix.lower().lstrip(".") - else: - file_path = NamedTemporaryFile( - delete=False, suffix=f".{file_type}" - ).name - print(file_path) - with open(file_path, "wb") as file: - file.write(base64.b64decode(file_content)) # type: ignore - - # Call the actual function with processed arguments - return func(self, file_path=file_path, *args, **kwargs) - - return wrapper - # Example of decorated methods: @handle_async_parsing def async_parse( From fcf97b257a0daabbb7c349c7973c83d3d96b9a67 Mon Sep 17 00:00:00 2001 From: SeisSerenata Date: Tue, 19 Nov 2024 03:14:17 +0000 Subject: [PATCH 5/5] chore: update naming, docstring and decorators --- any_parser/any_parser.py | 159 ++++++++++++++++++++++--------------- any_parser/async_parser.py | 10 ++- any_parser/base_parser.py | 43 ---------- any_parser/constants.py | 11 +++ any_parser/utils.py | 33 +++++--- 5 files changed, 130 insertions(+), 126 deletions(-) create mode 100644 any_parser/constants.py diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index 6771cd7..d4e4116 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -3,22 +3,64 @@ import base64 import json import time +import uuid from pathlib import Path -from tempfile import NamedTemporaryFile import requests from any_parser.async_parser import AsyncParser -from any_parser.base_parser import ProcessType +from any_parser.constants import ProcessType from any_parser.sync_parser import SyncParser -from any_parser.utils import validate_parser_inputs +from any_parser.utils import validate_file_inputs PUBLIC_SHARED_BASE_URL = "https://public-api.cambio-ai.com" TIMEOUT = 60 -def 
handle_parsing(func): - """Decorator to handle common file processing logic.""" +def handle_file_processing(func): + """ + Decorator to handle common file processing logic for parsing + and extraction operations. + + This decorator manages file input validation and processing, supporting + either direct file content or file path inputs. It performs base64 encoding + of file contents when a file path is provided. + + Args: + func: The decorated function that performs the actual parsing or + extraction. + + Parameters for decorated functions: + file_path (str, optional): Path to the file to be processed. If + provided, the file will be read and encoded in base64. + file_content (str, optional): Base64-encoded content of the file. If + provided, file_path will be ignored. + file_type (str, optional): The file extension/type (e.g., 'pdf'). + If not provided and file_path is given, it will be inferred from + the file extension. + *args, **kwargs: Additional arguments passed to the decorated function. + + Returns: + tuple: A tuple containing (error_message, result), where: + - error_message (str): Error message if processing fails, empty + string on success + - result (str): Empty string if error occurs, otherwise the + processed result from func + + Usage: + @handle_file_processing + def parse(self, file_path=None, file_content=None, file_type=None): + # Implementation + pass + + Note: + Either file_path or file_content must be provided, but not both. + If file_path is provided, the file content will be read and encoded in + base64, and file_type will be inferred from the file extension. 
+ If file_content is provided, file_type will be validated, and a + temporary file path will be generated for generating presigned url (for + async parsing and extraction) + """ def wrapper( self, @@ -30,7 +72,7 @@ def wrapper( ): # pylint: disable=too-many-arguments # Validate inputs - is_valid, error_message = validate_parser_inputs( + is_valid, error_message = validate_file_inputs( file_path=file_path, file_content=file_content, file_type=file_type, @@ -40,14 +82,16 @@ def wrapper( return error_message, "" # Encode the file content in base64 if file_path is provided - if file_content is None: - assert file_path is not None # Type narrowing for mypy + if file_path: try: with open(file_path, "rb") as file: file_content = base64.b64encode(file.read()).decode("utf-8") file_type = Path(file_path).suffix.lower().lstrip(".") except Exception as e: return f"Error: {e}", "" + else: + # generate a random file path for generating presigned url + file_path = f"/tmp/{uuid.uuid4()}.{file_type}" return func( self, @@ -61,60 +105,24 @@ def wrapper( return wrapper -def handle_async_parsing(func): - """Decorator to handle common async file processing logic.""" - - def wrapper( - self, - file_path=None, - file_content=None, - file_type=None, - *args, - **kwargs, - ): - # Validate inputs - is_valid, error_message = validate_parser_inputs( - file_path=file_path, - file_content=file_content, - file_type=file_type, - ) - - if not is_valid: - return error_message - - # Dump the file content into a NamedTemporaryFile if file_path - # is not provided - if file_path: - file_type = Path(file_path).suffix.lower().lstrip(".") - else: - file_path = NamedTemporaryFile(delete=False, suffix=f".{file_type}").name - print(file_path) - with open(file_path, "wb") as file: - file.write(base64.b64decode(file_content)) # type: ignore - - # Call the actual function with processed arguments - return func(self, file_path=file_path, *args, **kwargs) - - return wrapper - - class AnyParser: - """AnyParser RT: 
Real-time parser for any data format.""" + """Real-time parser for processing various data formats. + + Provides both synchronous and asynchronous methods for parsing and + extracting information from different types of files. + """ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None: - """Initialize the AnyParser RT object. + """Initialize AnyParser with API credentials. Args: - api_key (str): The API key for the AnyParser - url (str): The URL of the AnyParser RT API. - - Returns: - None + api_key: Authentication key for API access + base_url: API endpoint URL, defaults to public endpoint """ self._sync_parser = SyncParser(api_key, base_url) self._async_parser = AsyncParser(api_key, base_url) - @handle_parsing + @handle_file_processing def parse( self, file_path=None, @@ -122,7 +130,17 @@ def parse( file_type=None, extract_args=None, ): - """Extract full content from a file in real-time.""" + """Extract full content from a file synchronously. + + Args: + file_path: Path to input file + file_content: Base64 encoded file content + file_type: File format extension + extract_args: Additional extraction parameters + + Returns: + tuple: (result, timing_info) or (error_message, "") + """ response, info = self._sync_parser.get_sync_response( self._sync_parser._sync_parse_url, file_content=file_content, # type: ignore @@ -142,14 +160,16 @@ def parse( except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" - @handle_parsing + @handle_file_processing def extract_pii( self, file_path=None, file_content=None, file_type=None, ): - """Extract PII from a file in real-time.""" + """ + Extract PII data from a file synchronously. 
+ """ response, info = self._sync_parser.get_sync_response( self._sync_parser._sync_extract_pii, file_content=file_content, # type: ignore @@ -167,7 +187,7 @@ def extract_pii( except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" - @handle_parsing + @handle_file_processing def extract_tables( self, file_path=None, @@ -198,7 +218,7 @@ def extract_tables( except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" - @handle_parsing + @handle_file_processing def extract_key_value( self, file_path=None, @@ -232,7 +252,7 @@ def extract_key_value( except json.JSONDecodeError: return f"Error: Invalid JSON response: {response.text}", "" - @handle_parsing + @handle_file_processing def extract_resume_key_value( self, file_path=None, file_content=None, file_type=None ): @@ -270,7 +290,7 @@ def extract_resume_key_value( return f"Error: Invalid JSON response: {response.text}", "" # Example of decorated methods: - @handle_async_parsing + @handle_file_processing def async_parse( self, file_path=None, @@ -282,10 +302,11 @@ def async_parse( return self._async_parser.send_async_request( process_type=ProcessType.PARSE, file_path=file_path, # type: ignore + file_content=file_content, # type: ignore extract_args=extract_args, ) - @handle_async_parsing + @handle_file_processing def async_parse_with_layout( self, file_path=None, file_content=None, file_type=None ): @@ -293,17 +314,19 @@ def async_parse_with_layout( return self._async_parser.send_async_request( process_type=ProcessType.PARSE_WITH_LAYOUT, file_path=file_path, # type: ignore + file_content=file_content, # type: ignore ) - @handle_async_parsing + @handle_file_processing def async_parse_with_ocr(self, file_path=None, file_content=None, file_type=None): """Extract full content from a file asynchronously with OCR.""" return self._async_parser.send_async_request( process_type=ProcessType.PARSE_WITH_OCR, file_path=file_path, # type: ignore + 
file_content=file_content, # type: ignore ) - @handle_async_parsing + @handle_file_processing def async_extract_pii( self, file_path=None, @@ -315,18 +338,20 @@ def async_extract_pii( return self._async_parser.send_async_request( process_type=ProcessType.EXTRACT_PII, file_path=file_path, # type: ignore + file_content=file_content, # type: ignore extract_args=extract_args, ) - @handle_async_parsing + @handle_file_processing def async_extract_tables(self, file_path=None, file_content=None, file_type=None): """Extract tables from a file asynchronously.""" return self._async_parser.send_async_request( process_type=ProcessType.EXTRACT_TABLES, file_path=file_path, # type: ignore + file_content=file_content, # type: ignore ) - @handle_async_parsing + @handle_file_processing def async_extract_key_value( self, file_path=None, @@ -338,10 +363,11 @@ def async_extract_key_value( return self._async_parser.send_async_request( process_type=ProcessType.EXTRACT_KEY_VALUE, file_path=file_path, # type: ignore + file_content=file_content, # type: ignore extract_args={"extract_instruction": extract_instruction}, ) - @handle_async_parsing + @handle_file_processing def async_extract_resume_key_value( self, file_path=None, file_content=None, file_type=None ): @@ -349,6 +375,7 @@ def async_extract_resume_key_value( return self._async_parser.send_async_request( process_type=ProcessType.EXTRACT_RESUME_KEY_VALUE, file_path=file_path, # type: ignore + file_content=file_content, # type: ignore extract_args=None, ) diff --git a/any_parser/async_parser.py b/any_parser/async_parser.py index 4fb8cd6..ad941c3 100644 --- a/any_parser/async_parser.py +++ b/any_parser/async_parser.py @@ -6,7 +6,8 @@ import requests -from any_parser.base_parser import BaseParser, ProcessType +from any_parser.base_parser import BaseParser +from any_parser.constants import ProcessType from any_parser.utils import upload_file_to_presigned_url TIMEOUT = 60 @@ -21,14 +22,15 @@ def send_async_request( self, process_type: 
ProcessType, file_path: str, + file_content: str, extract_args: Optional[Dict] = None, ) -> str: """Extract full content from a file asynchronously. Args: - url_endpoint (str): The URL endpoint for the API. - file_path (str): The path to the file to be parsed. process_type (ProcessType): The type of processing to be done. + file_path (str): The path to the file to be parsed. + file_content (str): The content of the file to be parsed. extract_args (Optional[Dict]): Additional extraction arguments. Returns: @@ -55,4 +57,4 @@ def send_async_request( ) # If response successful, upload the file - return upload_file_to_presigned_url(file_path, response) + return upload_file_to_presigned_url(file_content, response) diff --git a/any_parser/base_parser.py b/any_parser/base_parser.py index aa3f9d8..0c33034 100644 --- a/any_parser/base_parser.py +++ b/any_parser/base_parser.py @@ -1,22 +1,5 @@ """Base parser implementation.""" -import base64 -from enum import Enum -from pathlib import Path -from typing import Optional, Tuple - -from any_parser.utils import validate_parser_inputs - - -class ProcessType(Enum): - EXTRACT_PII = "extract_pii" - EXTRACT_TABLES = "extract_tables" - EXTRACT_KEY_VALUE = "extract_key_value" - EXTRACT_RESUME_KEY_VALUE = "extract_resume_key_value" - PARSE = "parse" - PARSE_WITH_OCR = "parse_with_ocr" - PARSE_WITH_LAYOUT = "parse_with_layout" - class BaseParser: def __init__(self, api_key: str, base_url: str) -> None: @@ -31,29 +14,3 @@ def __init__(self, api_key: str, base_url: str) -> None: def _setup_endpoints(self) -> None: """Setup API endpoints - to be implemented by child classes.""" raise NotImplementedError - - def _process_file( - self, - file_path: Optional[str], - file_content: Optional[str], - file_type: Optional[str], - ) -> Tuple[Optional[str], Optional[str], Optional[str]]: - """Common file processing logic.""" - # Validate inputs - is_valid, error_message = validate_parser_inputs( - file_path=file_path, file_content=file_content, 
file_type=file_type - ) - - if not is_valid: - return None, None, error_message - - if file_content is None and file_path is not None: - try: - with open(file_path, "rb") as file: - file_read = file.read() - file_content = base64.b64encode(file_read).decode("utf-8") - file_type = Path(file_path).suffix.lower().lstrip(".") - except Exception as e: - return None, None, f"Error: {e}" - - return file_content, file_type, None diff --git a/any_parser/constants.py b/any_parser/constants.py new file mode 100644 index 0000000..3e938ad --- /dev/null +++ b/any_parser/constants.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class ProcessType(Enum): + EXTRACT_PII = "extract_pii" + EXTRACT_TABLES = "extract_tables" + EXTRACT_KEY_VALUE = "extract_key_value" + EXTRACT_RESUME_KEY_VALUE = "extract_resume_key_value" + PARSE = "parse" + PARSE_WITH_OCR = "parse_with_ocr" + PARSE_WITH_LAYOUT = "parse_with_layout" diff --git a/any_parser/utils.py b/any_parser/utils.py index b63a4e6..11d34be 100644 --- a/any_parser/utils.py +++ b/any_parser/utils.py @@ -1,3 +1,5 @@ +import base64 +import io import json from enum import Enum from pathlib import Path @@ -28,12 +30,12 @@ class ValidationError(Enum): OTHER = "{}" -def validate_parser_inputs( +def validate_file_inputs( file_path: Optional[str], file_content: Optional[str], file_type: Optional[str], ) -> Tuple[bool, str]: - """Validate inputs for the parser. + """Validate inputs for the parser or extractor. 
Args: file_content (Optional[str]): Base64 encoded file content @@ -80,22 +82,27 @@ def validate_parser_inputs( def upload_file_to_presigned_url( - file_path: str, response: requests.Response, timeout: int = 10 + file_content: str, response: requests.Response, timeout: int = 10 ) -> str: if response.status_code == 200: try: file_id = response.json().get("fileId") presigned_url = response.json().get("presignedUrl") - with open(file_path, "rb") as file_to_upload: - files = {"file": (file_path, file_to_upload)} - upload_resp = requests.post( - presigned_url["url"], - data=presigned_url["fields"], - files=files, - timeout=timeout, - ) - if upload_resp.status_code != 204: - return f"Error: {upload_resp.status_code} {upload_resp.text}" + + # Decode base64 content + decoded_content = base64.b64decode(file_content) + + # Create file-like object from decoded content + files = {"file": ("file", io.BytesIO(decoded_content))} + + upload_resp = requests.post( + presigned_url["url"], + data=presigned_url["fields"], + files=files, + timeout=timeout, + ) + if upload_resp.status_code != 204: + return f"Error: {upload_resp.status_code} {upload_resp.text}" return file_id except json.JSONDecodeError: return "Error: Invalid JSON response"