From 3f5cdf0397deac84f45900b7fee6bee0e93a98aa Mon Sep 17 00:00:00 2001 From: Jingyi Date: Mon, 2 Dec 2024 07:03:39 +0000 Subject: [PATCH] refactor any parser classes --- any_parser/any_parser.py | 126 ++++++++------------------- any_parser/async_parser.py | 59 ++++++++++++- any_parser/base_parser.py | 5 -- any_parser/sync_parser.py | 171 ++++++++++++++++++++++++++++++++++--- 4 files changed, 254 insertions(+), 107 deletions(-) diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py index a97f8f9..ac52c92 100644 --- a/any_parser/any_parser.py +++ b/any_parser/any_parser.py @@ -10,7 +10,13 @@ from any_parser.async_parser import AsyncParser from any_parser.constants import ProcessType -from any_parser.sync_parser import SyncParser +from any_parser.sync_parser import ( + ExtractKeyValueSyncParser, + ExtractPIISyncParser, + ExtractResumeKeyValueSyncParser, + ExtractTablesSyncParser, + ParseSyncParser, +) from any_parser.utils import validate_file_inputs PUBLIC_SHARED_BASE_URL = "https://public-api.cambio-ai.com" @@ -119,8 +125,14 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None api_key: Authentication key for API access base_url: API endpoint URL, defaults to public endpoint """ - self._sync_parser = SyncParser(api_key, base_url) self._async_parser = AsyncParser(api_key, base_url) + self._sync_parse = ParseSyncParser(api_key, base_url) + self._sync_extract_key_value = ExtractKeyValueSyncParser(api_key, base_url) + self._sync_extract_resume_key_value = ExtractResumeKeyValueSyncParser( + api_key, base_url + ) + self._sync_extract_pii = ExtractPIISyncParser(api_key, base_url) + self._sync_extract_tables = ExtractTablesSyncParser(api_key, base_url) @handle_file_processing def parse( @@ -141,23 +153,13 @@ def parse( Returns: tuple: (result, timing_info) or (error_message, "") """ - response, info = self._sync_parser.get_sync_response( - self._sync_parser._sync_parse_url, - file_content=file_content, # type: ignore - file_type=file_type, # type: ignore + return self._sync_parse.parse( + file_path=file_path, + file_content=file_content, + file_type=file_type, extract_args=extract_args, ) - if response is None: - return info, "" - - try: - response_data = response.json() - result = response_data["markdown"] - return result, f"Time Elapsed: {info}" - except json.JSONDecodeError: - return f"Error: Invalid JSON response: {response.text}", "" - @handle_file_processing def extract_pii( self, @@ -168,23 +170,12 @@ def extract_pii( """ Extract PII data from a file synchronously. """ - response, info = self._sync_parser.get_sync_response( - self._sync_parser._sync_extract_pii, - file_content=file_content, # type: ignore - file_type=file_type, # type: ignore - extract_args=None, + return self._sync_extract_pii.extract( + file_path=file_path, + file_content=file_content, + file_type=file_type, ) - if response is None: - return info, "" - - try: - response_data = response.json() - result = response_data["pii_extraction"] - return result, f"Time Elapsed: {info}" - except json.JSONDecodeError: - return f"Error: Invalid JSON response: {response.text}", "" - @handle_file_processing def extract_tables( self, @@ -199,23 +190,12 @@ def extract_tables( Returns: tuple(str, str): The extracted data and the time taken. """ - response, info = self._sync_parser.get_sync_response( - self._sync_parser._sync_extract_tables, - file_content=file_content, # type: ignore - file_type=file_type, # type: ignore - extract_args=None, + return self._sync_extract_tables.extract( + file_path=file_path, + file_content=file_content, + file_type=file_type, ) - if response is None: - return info, "" - - try: - response_data = response.json() - result = response_data["markdown"] - return result, f"Time Elapsed: {info}" - except json.JSONDecodeError: - return f"Error: Invalid JSON response: {response.text}", "" - @handle_file_processing def extract_key_value( self, @@ -233,23 +213,13 @@ def extract_key_value( Returns: tuple(str, str): The extracted data and the time taken. """ - response, info = self._sync_parser.get_sync_response( - self._sync_parser._sync_extract_key_value, - file_content=file_content, # type: ignore - file_type=file_type, # type: ignore + return self._sync_extract_key_value.extract( + file_path=file_path, + file_content=file_content, + file_type=file_type, extract_args={"extract_instruction": extract_instruction}, ) - if response is None: - return info, "" - - try: - response_data = response.json() - result = response_data["json"] - return result, f"Time Elapsed: {info}" - except json.JSONDecodeError: - return f"Error: Invalid JSON response: {response.text}", "" - @handle_file_processing def extract_resume_key_value( self, file_path=None, file_content=None, file_type=None @@ -270,23 +240,12 @@ def extract_resume_key_value( - "pii": Personally Identifiable Information - includes only name, email, and phone """ - response, info = self._sync_parser.get_sync_response( - self._sync_parser._sync_extract_resume_key_value, - file_content=file_content, # type: ignore - file_type=file_type, # type: ignore - extract_args=None, + return self._sync_extract_resume_key_value.extract( + file_path=file_path, + file_content=file_content, + file_type=file_type, ) - if response is None: - return info, "" - - try: - response_data = response.json() - result = response_data["extraction_result"] - return result, f"Time Elapsed: {info}" - except json.JSONDecodeError: - return f"Error: Invalid JSON response: {response.text}", "" - # Example of decorated methods: @handle_file_processing def async_parse( @@ -425,19 +384,4 @@ def async_fetch( timeout=TIMEOUT, ) - if response is None: - return "Error: timeout, no response received" - if response.status_code == 200: - result = response.json() - if "json" in result: - return result["json"] - elif "resume_extraction" in result: - return result["resume_extraction"] - elif "pii_extraction" in result: - return result["pii_extraction"] - elif "markdown" in result: - return result["markdown"] - return f"Error: Invalid response format\n {result}" - if response.status_code == 202: - return "" - return f"Error: {response.status_code} {response.text}" + return self._async_parser.handle_async_response(response) diff --git a/any_parser/async_parser.py b/any_parser/async_parser.py index ad941c3..7ca194c 100644 --- a/any_parser/async_parser.py +++ b/any_parser/async_parser.py @@ -13,8 +13,48 @@ TIMEOUT = 60 +class BasePostProcessor: + def __init__(self, successor=None) -> None: + self.successor = successor + + def process(self, json_response: Dict) -> str: + if self.successor: + return self.successor.process(json_response) + return f"Error: Invalid JSON response: {json_response}" + + +class ParsePostProcessor(BasePostProcessor): + def process(self, json_response: Dict) -> str: + if "markdown" in json_response: + return json_response["markdown"] + return super().process(json_response) + + +class KeyValuePostProcessor(BasePostProcessor): + def process(self, json_response: Dict) -> str: + if "json" in json_response: + return json_response["json"] + return super().process(json_response) + + +class ExtractPIIPostProcessor(BasePostProcessor): + def process(self, json_response: Dict) -> str: + if "pii_extraction" in json_response: + return json_response["pii_extraction"] + return super().process(json_response) + + +class ExtractResumeKeyValuePostProcessor(BasePostProcessor): + + def process(self, json_response: Dict) -> str: + if "resume_extraction" in json_response: + return json_response["resume_extraction"] + return super().process(json_response) + + class AsyncParser(BaseParser): - def _setup_endpoints(self) -> None: + def __init__(self, api_key: str, base_url: str) -> None: + super().__init__(api_key, base_url) self._async_upload_url = f"{self._base_url}/async/upload" self._async_fetch_url = f"{self._base_url}/async/fetch" @@ -58,3 +98,20 @@ def send_async_request( # If response successful, upload the file return upload_file_to_presigned_url(file_content, response) + + def handle_async_response(self, response) -> str: + if response is None: + return "Error: timeout, no response received" + if response.status_code == 202: + return "" + if response.status_code == 200: + extract_resume_processor = ExtractResumeKeyValuePostProcessor() + key_value_processor = KeyValuePostProcessor(extract_resume_processor) + extract_pii_processor = ExtractPIIPostProcessor(key_value_processor) + handler = ParsePostProcessor(extract_pii_processor) + try: + return handler.process(response.json()) + except json.JSONDecodeError: + return f"Error: Invalid JSON response: {response.text}" + + return f"Error: {response.status_code} {response.text}" diff --git a/any_parser/base_parser.py b/any_parser/base_parser.py index 0c33034..963c025 100644 --- a/any_parser/base_parser.py +++ b/any_parser/base_parser.py @@ -9,8 +9,3 @@ def __init__(self, api_key: str, base_url: str) -> None: "Content-Type": "application/json", "x-api-key": self._api_key, } - self._setup_endpoints() - - def _setup_endpoints(self) -> None: - """Setup API endpoints - to be implemented by child classes.""" - raise NotImplementedError diff --git a/any_parser/sync_parser.py b/any_parser/sync_parser.py index 012d8e8..25d918e 100644 --- a/any_parser/sync_parser.py +++ b/any_parser/sync_parser.py @@ -11,16 +11,7 @@ TIMEOUT = 60 -class SyncParser(BaseParser): - def _setup_endpoints(self) -> None: - self._sync_parse_url = f"{self._base_url}/parse" - self._sync_extract_pii = f"{self._base_url}/extract_pii" - self._sync_extract_tables = f"{self._base_url}/extract_tables" - self._sync_extract_key_value = f"{self._base_url}/extract_key_value" - self._sync_extract_resume_key_value = ( - f"{self._base_url}/extract_resume_key_value" - ) - self._sync_parse_with_ocr = f"{self._base_url}/parse_with_ocr" +class BaseSyncParser(BaseParser): def get_sync_response( self, @@ -49,3 +40,163 @@ def get_sync_response( return None, f"Error: {response.status_code} {response.text}" return response, f"{end_time - start_time:.2f} seconds" + + def parse( + self, + file_path=None, + file_content=None, + file_type=None, + extract_args=None, + ): + """Converts the given file to markdown.""" + raise NotImplementedError + + def extract( + self, + file_path=None, + file_content=None, + file_type=None, + extract_args=None, + ): + """Extracts information from the given file.""" + raise NotImplementedError + + +class ParseSyncParser(BaseSyncParser): + """Parse parser implementation.""" + + def parse( + self, + file_path=None, + file_content=None, + file_type=None, + extract_args=None, + ): + response, info = self.get_sync_response( + f"{self._base_url}/parse", + file_content=file_content, # type: ignore + file_type=file_type, # type: ignore + extract_args=extract_args, + ) + + if response is None: + return info, "" + + try: + response_data = response.json() + result = response_data["markdown"] + return result, f"Time Elapsed: {info}" + except json.JSONDecodeError: + return f"Error: Invalid JSON response: {response.text}", "" + + +class ExtractPIISyncParser(BaseSyncParser): + """Extract PII parser implementation.""" + + def extract( + self, + file_path=None, + file_content=None, + file_type=None, + extract_args=None, + ): + response, info = self.get_sync_response( + f"{self._base_url}/extract_pii", + file_content=file_content, # type: ignore + file_type=file_type, # type: ignore + extract_args=None, + ) + + if response is None: + return info, "" + + try: + response_data = response.json() + result = response_data["pii_extraction"] + return result, f"Time Elapsed: {info}" + except json.JSONDecodeError: + return f"Error: Invalid JSON response: {response.text}", "" + + +class ExtractTablesSyncParser(BaseSyncParser): + """Extract tables parser implementation.""" + + def extract( + self, + file_path=None, + file_content=None, + file_type=None, + extract_args=None, + ): + response, info = self.get_sync_response( + f"{self._base_url}/extract_tables", + file_content=file_content, # type: ignore + file_type=file_type, # type: ignore + extract_args=None, + ) + + if response is None: + return info, "" + + try: + response_data = response.json() + result = response_data["markdown"] + return result, f"Time Elapsed: {info}" + except json.JSONDecodeError: + return f"Error: Invalid JSON response: {response.text}", "" + + +class ExtractKeyValueSyncParser(BaseSyncParser): + """Extract key-value parser implementation.""" + + def extract( + self, + file_path=None, + file_content=None, + file_type=None, + extract_args=None, + ): + response, info = self.get_sync_response( + f"{self._base_url}/extract_key_value", + file_content=file_content, # type: ignore + file_type=file_type, # type: ignore + extract_args={"extract_instruction": extract_args}, + ) + + if response is None: + return info, "" + + try: + response_data = response.json() + result = response_data["json"] + return result, f"Time Elapsed: {info}" + except json.JSONDecodeError: + return f"Error: Invalid JSON response: {response.text}", "" + + +class ExtractResumeKeyValueSyncParser(BaseSyncParser): + """Extract resume key-value parser implementation.""" + + def extract( + self, + file_path=None, + file_content=None, + file_type=None, + extract_args=None, + ): + response, info = self.get_sync_response( + f"{self._base_url}/extract_resume_key_value", + file_content=file_content, # type: ignore + file_type=file_type, # type: ignore + extract_args=None, + ) + + if response is None: + return info, "" + + try: + response_data = response.json() + result = response_data["extraction_result"] + return result, f"Time Elapsed: {info}" + except json.JSONDecodeError: + return f"Error: Invalid JSON response: {response.text}", ""