diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py
index e3550ac..00ac450 100644
--- a/any_parser/any_parser.py
+++ b/any_parser/any_parser.py
@@ -25,6 +25,7 @@ class ProcessType(Enum):
     TABLE = "table"
     FILE_REFINED = "file_refined"
     FILE_REFINED_QUICK = "file_refined_quick"
+    PARSE_WITH_LAYOUT = "parse_with_layout"
 
 
 class AnyParser:
@@ -42,6 +43,7 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None
         """
         self._sync_extract_url = f"{base_url}/extract"
         self._sync_json_url = f"{base_url}/json/extract"
+        self._sync_resume_url = f"{base_url}/resume/extract"
         self._sync_refined_url = f"{base_url}/refined_parse"
         self._async_upload_url = f"{base_url}/async/upload"
         self._async_fetch_url = f"{base_url}/async/fetch"
@@ -187,6 +189,66 @@ def extract_key_value(
         else:
             return f"Error: {response.status_code} {response.text}", None
 
+    def extract_resume_key_value(
+        self,
+        file_path: str,
+    ) -> Tuple[str, str]:
+        """Extract resume in real-time.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+        Returns:
+            tuple(str, str): The extracted data and the time taken, or
+                (error message, None) on failure. The extracted data includes:
+                - "education": Education
+                - "work_experience": Work Experience
+                - "personal_info": Personal Information
+                - "skills": Skills
+                - "certifications": Certifications
+                - "projects": Projects
+                - "pii": Personally Identifiable Information - includes only name, email, and phone
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Check if the file exists and file_type
+        error = check_file_type_and_path(file_path, file_extension)
+        if error:
+            return error, None
+
+        # Encode the file content in base64
+        with open(file_path, "rb") as file:
+            encoded_file = base64.b64encode(file.read()).decode("utf-8")
+
+        # Create the JSON payload
+        payload = {
+            "file_content": encoded_file,
+            "file_type": file_extension,
+        }
+
+        # Send the POST request
+        start_time = time.time()
+        response = requests.post(
+            self._sync_resume_url,
+            headers=self._headers,
+            data=json.dumps(payload),
+            timeout=TIMEOUT,
+        )
+        end_time = time.time()
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            try:
+                response_data = response.json()
+                result = response_data["extraction_result"]
+                return (
+                    result,
+                    f"Time Elapsed: {end_time - start_time:.2f} seconds",
+                )
+            except (json.JSONDecodeError, KeyError):
+                return f"Error: Invalid JSON response: {response.text}", None
+        else:
+            return f"Error: {response.status_code} {response.text}", None
+
     def async_extract(
         self,
         file_path: str,
@@ -244,6 +306,44 @@ def async_extract(
         # If response successful, upload the file
         return upload_file_to_presigned_url(file_path, response)
 
+    def async_parse_with_layout(self, file_path: str) -> str:
+        """Extract full content from a file asynchronously.
+
+        Compared with `async_extract`, this method will first analyze the layout of the file.
+        Then it will process text, tables, and images separately;
+        and return the combined result in markdown format.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+        Returns:
+            str: The file id of the uploaded file, or an error message if the file check fails.
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Check if the file exists and file_type
+        error = check_file_type_and_path(file_path, file_extension)
+
+        if error:
+            return error
+
+        file_name = Path(file_path).name
+        # Create the JSON payload
+        payload = {
+            "file_name": file_name,
+            "process_type": ProcessType.PARSE_WITH_LAYOUT.value,
+        }
+
+        # Send the POST request
+        response = requests.post(
+            self._async_upload_url,
+            headers=self._headers,
+            data=json.dumps(payload),
+            timeout=TIMEOUT,
+        )
+
+        # If response successful, upload the file
+        return upload_file_to_presigned_url(file_path, response)
+
     def async_extract_key_value(
         self,
         file_path: str,
@@ -286,6 +386,44 @@ def async_extract_key_value(
         # If response successful, upload the file
         return upload_file_to_presigned_url(file_path, response)
 
+    def async_extract_resume_key_value(
+        self,
+        file_path: str,
+    ) -> str:
+        """Extract resume key-value pairs from a file asynchronously.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+        Returns:
+            str: The file id of the uploaded file, or an error message if the file check fails.
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Check if the file exists and file_type
+        error = check_file_type_and_path(file_path, file_extension)
+
+        if error:
+            return error
+
+        file_name = Path(file_path).name
+
+        # Create the JSON payload
+        payload = {
+            "file_name": file_name,
+            "process_type": "resume_extract",
+        }
+
+        # Send the POST request
+        response = requests.post(
+            self._async_upload_url,
+            headers=self._headers,
+            data=json.dumps(payload),
+            timeout=TIMEOUT,
+        )
+
+        # If response successful, upload the file
+        return upload_file_to_presigned_url(file_path, response)
+
     def async_fetch(
         self,
         file_id: str,
@@ -337,6 +475,8 @@ def async_fetch(
             result = response.json()
             if "json" in result:
                 return result["json"]
+            elif "resume_extraction" in result:
+                return result["resume_extraction"]
             elif "markdown" in result:
                 markdown_list = result["markdown"]
                 return "\n".join(markdown_list)
diff --git a/examples/async_extract_resume_key_value.ipynb b/examples/async_extract_resume_key_value.ipynb
new file mode 100644
index 0000000..5eb2535
--- /dev/null
+++ b/examples/async_extract_resume_key_value.ipynb
@@ -0,0 +1,261 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
+    "# !pip3 install --upgrade ipython\n",
+    "# !pip3 install --upgrade any-parser"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import display\n",
+    "from any_parser import AnyParser"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ap = AnyParser(api_key=\"...\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file_path = \"./sample_data/resume_1.pdf\"\n",
+    "file_id = 
ap.async_extract_resume_key_value(file_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n" + ] + } + ], + "source": [ + "json_result = ap.async_fetch(file_id=file_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'pii': {'full_name': 'Gary Jiang',\n", + " 'email': 'jiangzhehuan0105@gmail.com',\n", + " 'phone': '+1 (213) 725-7637'},\n", + " 'education': [{'organization': 'Shenyang University of Technology',\n", + " 'degree': \"Bachelor's Degree\",\n", + " 'major': 'Computer Science',\n", + " 'start_date': '2008-01-01',\n", + " 'end_date': '2012-12-31',\n", + " 'courses': None,\n", + " 'achievements': None}],\n", + " 'work_experience': [{'job_title': 'Full Stack Developer',\n", + " 'company_name': 'VIMMERSE',\n", + " 'location': None,\n", + " 'start_date': '2023-06-01',\n", + " 'end_date': 'present',\n", + " 'job_type': None,\n", + " 'summary': 'Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n", + " 'bullet_points': ['Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications.',\n", + " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience.',\n", + " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication.',\n", + " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement.',\n", + " 'Collaborated effectively 
with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment.']},\n", + " {'job_title': 'Full Stack Developer',\n", + " 'company_name': 'VIKING SASQUATCH',\n", + " 'location': None,\n", + " 'start_date': '2023-01-01',\n", + " 'end_date': '2023-06-01',\n", + " 'job_type': None,\n", + " 'summary': 'Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n", + " 'bullet_points': ['Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies.',\n", + " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience.',\n", + " 'Built backend APIs utilizing Node.js serverless functions for optimal performance.',\n", + " 'Managed data storage and security by implementing a MySQL database.',\n", + " 'Collaborated effectively within a team using Agile methodologies like sprint planning, daily standups, retrospectives to ensure project delivery and continuous improvement.']},\n", + " {'job_title': 'Full Stack Developer',\n", + " 'company_name': 'ROX PAY SRL',\n", + " 'location': None,\n", + " 'start_date': '2021-12-01',\n", + " 'end_date': '2022-12-31',\n", + " 'job_type': None,\n", + " 'summary': 'Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity.',\n", + " 'bullet_points': ['Developed front-end by using React.js and Redux, Javascript/Typescript.',\n", + " 'Contributed developing backend utilizing Django/Python.']},\n", + " {'job_title': 'Freelancer',\n", + " 'company_name': 'FREELANCE',\n", + " 'location': None,\n", + " 'start_date': '2017-09-01',\n", + " 'end_date': '2021-10-31',\n", + " 'job_type': None,\n", + " 'summary': 'Developed and managed many web and mobile applications while working as freelancer at Internet 
Dzyns LLC company.',\n", + " 'bullet_points': ['Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance.',\n", + " 'Developed cross-platform mobile application using Flutter and Ionic/Angular.',\n", + " 'Developed NFT marketplace websites and wrote smart contracts.']},\n", + " {'job_title': 'Server Administrator, Java Developer',\n", + " 'company_name': 'NEUSOFT',\n", + " 'location': None,\n", + " 'start_date': '2014-06-01',\n", + " 'end_date': '2017-08-31',\n", + " 'job_type': None,\n", + " 'summary': 'Worked as intern and software developer after graduated university.',\n", + " 'bullet_points': ['Correct analytical and reasoning skills to troubleshoot and repair server issues.',\n", + " 'Operating Systems & Security Software.',\n", + " 'Java / Spring Boot / Hibernate.']}],\n", + " 'personal_info': {'name': 'GARY JIANG',\n", + " 'phone_number': '+1-213-725-7637',\n", + " 'address': None,\n", + " 'email_address': 'jiangzhehuan0105@gmail.com',\n", + " 'linkedin_url': 'linkedin.com/in/gary-jiang',\n", + " 'github_url': None,\n", + " 'summary': None},\n", + " 'skills': {'Programming Languages': ['Python',\n", + " 'PHP',\n", + " 'Javascript',\n", + " 'Typescript',\n", + " 'HTML',\n", + " 'CSS'],\n", + " 'Tools': ['Flask',\n", + " 'Django',\n", + " 'FastAPI',\n", + " 'Laravel',\n", + " 'Node.js',\n", + " 'SQL databases',\n", + " 'Next.js',\n", + " 'React',\n", + " 'Redux',\n", + " 'Nuxt.js',\n", + " 'Vue',\n", + " 'AWS Lambda',\n", + " 'Cognito',\n", + " 'EC2',\n", + " 'S3',\n", + " 'DynamoDB',\n", + " 'API Gateway',\n", + " 'Flutter',\n", + " 'Ionic',\n", + " 'Angular',\n", + " 'Git',\n", + " 'Version Control'],\n", + " 'Other': ['Startup Experience',\n", + " 'Adaptable',\n", + " 'Resourceful',\n", + " 'Prioritization',\n", + " 'Hybrid Mobile App Development',\n", + " 'AGILE',\n", + " 'SCRUM',\n", + " 'DevOps',\n", + " 
'CI/CD']},\n", + " 'certifications': [],\n", + " 'projects': [{'organization': 'VIMMERSE',\n", + " 'project_name': 'AI-powered Editor Web Application',\n", + " 'location': None,\n", + " 'start_date': '2023-06-01',\n", + " 'end_date': 'present',\n", + " 'descriptions': ['Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n", + " 'Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications.',\n", + " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience.',\n", + " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication.',\n", + " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement.',\n", + " 'Collaborated effectively with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment.']},\n", + " {'organization': 'VIKING SASQUATCH',\n", + " 'project_name': 'Real Estate Transactions API and Integrations',\n", + " 'location': None,\n", + " 'start_date': '2023-01-01',\n", + " 'end_date': '2023-06-30',\n", + " 'descriptions': ['Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n", + " 'Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies.',\n", + " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience.',\n", + " 'Built backend APIs utilizing Node.js serverless functions for optimal performance.',\n", + " 'Managed data storage and security by implementing a MySQL database.',\n", + " 'Collaborated effectively within a team using Agile methodologies like sprint planning, 
daily standups, retrospectives to ensure project delivery and continuous improvement.']},\n", + " {'organization': 'ROX PAY SRL',\n", + " 'project_name': 'Fintech Software House',\n", + " 'location': None,\n", + " 'start_date': '2021-12-01',\n", + " 'end_date': '2022-12-31',\n", + " 'descriptions': ['Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity by essentially creating a Commission Free, Open Loop, Payment Gateway system.',\n", + " 'Developed front-end by using React.js and Redux, Javascript/Typescript.',\n", + " 'Contributed developing backend utilizing Django/Python.']},\n", + " {'organization': 'FREELANCE',\n", + " 'project_name': 'Various Web and Mobile Applications',\n", + " 'location': None,\n", + " 'start_date': '2017-09-01',\n", + " 'end_date': '2021-10-31',\n", + " 'descriptions': ['Developed and managed many web and mobile applications while working as freelancer at Internet Dzyns LLC company.',\n", + " 'Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance.',\n", + " 'Developed cross-platform mobile application using Flutter and Ionic/Angular.',\n", + " 'Developed NFT marketplace websites and wrote smart contracts.']},\n", + " {'organization': 'NEUSOFT',\n", + " 'project_name': 'Server Administration and Java Development',\n", + " 'location': None,\n", + " 'start_date': '2014-06-01',\n", + " 'end_date': '2017-08-31',\n", + " 'descriptions': ['Worked as intern and software developer after graduated university.',\n", + " 'Correct analytical and reasoning skills to troubleshoot and repair server issues.',\n", + " 'Operating Systems & Security Software.',\n", + " 'Java / Spring Boot / Hibernate.']}]}" + ] + }, + "metadata": {}, + "output_type": 
"display_data" + } + ], + "source": [ + "display(json_result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "any", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/async_parse_with_layout.ipynb b/examples/async_parse_with_layout.ipynb new file mode 100644 index 0000000..138ea58 --- /dev/null +++ b/examples/async_parse_with_layout.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries (ipython is used for displaying markdown in this demo)\n", + "# !pip3 install --upgrade ipython\n", + "# !pip3 install --upgrade any-parser" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display, Markdown\n", + "from any_parser import AnyParser" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "ap = AnyParser(api_key=\"...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = \"./sample_data/test_1figure_1table.png\"\n", + "file_id = ap.async_parse_with_layout(file_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n" + ] + } + ], + "source": [ + "markdown_output = ap.async_fetch(file_id=file_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/markdown": [ + "\n", + "
latency | (ms) | |
participants | mean | 99th percentile |
1 | 17.0 +1.4 | 75.0 34.9 |
2 | 24.5 +2.5 | 87.6 +35.9 |
5 | 31.5 +6.2 | 104.5 52.2 |
10 | 30.0 +3.7 | 95.6 +25.4 |
25 | 35.5 +5.6 | 100.4 42.7 |
50 | 42.7 4.1 | 93.7 22.9 |
100 | 71.4 7.6 | 131.2 +17.6 |
200 | 150.5 +11.0 | 320.3 35.1 |