diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py
index e3550ac..00ac450 100644
--- a/any_parser/any_parser.py
+++ b/any_parser/any_parser.py
@@ -25,6 +25,7 @@ class ProcessType(Enum):
     TABLE = "table"
     FILE_REFINED = "file_refined"
     FILE_REFINED_QUICK = "file_refined_quick"
+    PARSE_WITH_LAYOUT = "parse_with_layout"
 
 
 class AnyParser:
@@ -42,6 +43,7 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None
         """
         self._sync_extract_url = f"{base_url}/extract"
         self._sync_json_url = f"{base_url}/json/extract"
+        self._sync_resume_url = f"{base_url}/resume/extract"
         self._sync_refined_url = f"{base_url}/refined_parse"
         self._async_upload_url = f"{base_url}/async/upload"
         self._async_fetch_url = f"{base_url}/async/fetch"
@@ -187,6 +189,74 @@ def extract_key_value(
         else:
             return f"Error: {response.status_code} {response.text}", None
 
+    def extract_resume_key_value(
+        self,
+        file_path: str,
+    ) -> Tuple[str, str]:
+        """Extract structured resume data from a file in real-time.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+        Returns:
+            tuple: On success, the extracted data and the time taken;
+                on any failure, an error message and None.
+                extracted data includes:
+                - "education": Education
+                - "work_experience": Work Experience
+                - "personal_info": Personal Information
+                - "skills": Skills
+                - "certifications": Certifications
+                - "projects": Projects
+                - "pii": Personally Identifiable Information - includes only name, email, and phone
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Check if the file exists and file_type
+        error = check_file_type_and_path(file_path, file_extension)
+        if error:
+            return error, None
+
+        # Encode the file content in base64
+        with open(file_path, "rb") as file:
+            encoded_file = base64.b64encode(file.read()).decode("utf-8")
+
+        # Create the JSON payload
+        payload = {
+            "file_content": encoded_file,
+            "file_type": file_extension,
+        }
+
+        # Send the POST request
+        start_time = time.time()
+        response = requests.post(
+            self._sync_resume_url,
+            headers=self._headers,
+            data=json.dumps(payload),
+            timeout=TIMEOUT,
+        )
+        end_time = time.time()
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            try:
+                response_data = response.json()
+                result = response_data["extraction_result"]
+                return (
+                    result,
+                    f"Time Elapsed: {end_time - start_time:.2f} seconds",
+                )
+            except json.JSONDecodeError:
+                return f"Error: Invalid JSON response: {response.text}", None
+            except (KeyError, TypeError):
+                # A 200 reply that lacks "extraction_result" is reported as an
+                # error tuple like the other failure paths, rather than raising.
+                return (
+                    f"Error: No extraction_result in response: {response.text}",
+                    None,
+                )
+        else:
+            return f"Error: {response.status_code} {response.text}", None
+
     def async_extract(
         self,
         file_path: str,
@@ -244,6 +314,46 @@ def async_extract(
         # If response successful, upload the file
         return upload_file_to_presigned_url(file_path, response)
 
+    def async_parse_with_layout(self, file_path: str) -> str:
+        """Extract full content from a file asynchronously.
+
+        Compared with `async_extract`, this method will first analyze the layout of the file.
+        Then it will process text, tables, and images separately;
+        and return the combined result in markdown format.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+        Returns:
+            str: The file id of the uploaded file, or the error reported by
+                the file path/type check when that check fails.
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Check if the file exists and file_type
+        error = check_file_type_and_path(file_path, file_extension)
+
+        if error:
+            # Return the error alone so the result matches the declared str type
+            return error
+
+        file_name = Path(file_path).name
+        # Create the JSON payload
+        payload = {
+            "file_name": file_name,
+            "process_type": "parse_with_layout",
+        }
+
+        # Send the POST request
+        response = requests.post(
+            self._async_upload_url,
+            headers=self._headers,
+            data=json.dumps(payload),
+            timeout=TIMEOUT,
+        )
+
+        # If response successful, upload the file
+        return upload_file_to_presigned_url(file_path, response)
+
     def async_extract_key_value(
         self,
         file_path: str,
@@ -286,6 +396,46 @@ def async_extract_key_value(
         # If response successful, upload the file
         return upload_file_to_presigned_url(file_path, response)
 
+    def async_extract_resume_key_value(
+        self,
+        file_path: str,
+    ) -> str:
+        """Extract resume key-value pairs from a file asynchronously.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+        Returns:
+            str: The file id of the uploaded file, or the error reported by
+                the file path/type check when that check fails.
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Check if the file exists and file_type
+        error = check_file_type_and_path(file_path, file_extension)
+
+        if error:
+            # Return the error alone so the result matches the declared str type
+            return error
+
+        file_name = Path(file_path).name
+
+        # Create the JSON payload
+        payload = {
+            "file_name": file_name,
+            "process_type": "resume_extract",
+        }
+
+        # Send the POST request
+        response = requests.post(
+            self._async_upload_url,
+            headers=self._headers,
+            data=json.dumps(payload),
+            timeout=TIMEOUT,
+        )
+
+        # If response successful, upload the file
+        return upload_file_to_presigned_url(file_path, response)
+
     def async_fetch(
         self,
         file_id: str,
@@ -337,6 +487,8 @@ def async_fetch(
             result = response.json()
             if "json" in result:
                 return result["json"]
+            elif "resume_extraction" in result:
+                return result["resume_extraction"]
             elif "markdown" in result:
                 markdown_list = result["markdown"]
                 return "\n".join(markdown_list)
diff --git a/examples/async_extract_resume_key_value.ipynb b/examples/async_extract_resume_key_value.ipynb new file mode 100644 index 0000000..5eb2535 --- /dev/null +++ b/examples/async_extract_resume_key_value.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries (ipython is used for displaying markdown in this demo)\n", + "# !pip3 install --upgrade ipython\n", + "# !pip3 install --upgrade any-parser" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display\n", + "from any_parser import AnyParser" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "ap = AnyParser(api_key=\"...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = \"./sample_data/resume_1.pdf\"\n", + "file_id = 
ap.async_extract_resume_key_value(file_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n" + ] + } + ], + "source": [ + "json_result = ap.async_fetch(file_id=file_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'pii': {'full_name': 'Gary Jiang',\n", + " 'email': 'jiangzhehuan0105@gmail.com',\n", + " 'phone': '+1 (213) 725-7637'},\n", + " 'education': [{'organization': 'Shenyang University of Technology',\n", + " 'degree': \"Bachelor's Degree\",\n", + " 'major': 'Computer Science',\n", + " 'start_date': '2008-01-01',\n", + " 'end_date': '2012-12-31',\n", + " 'courses': None,\n", + " 'achievements': None}],\n", + " 'work_experience': [{'job_title': 'Full Stack Developer',\n", + " 'company_name': 'VIMMERSE',\n", + " 'location': None,\n", + " 'start_date': '2023-06-01',\n", + " 'end_date': 'present',\n", + " 'job_type': None,\n", + " 'summary': 'Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n", + " 'bullet_points': ['Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications.',\n", + " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience.',\n", + " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication.',\n", + " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement.',\n", + " 'Collaborated effectively 
with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment.']},\n", + " {'job_title': 'Full Stack Developer',\n", + " 'company_name': 'VIKING SASQUATCH',\n", + " 'location': None,\n", + " 'start_date': '2023-01-01',\n", + " 'end_date': '2023-06-01',\n", + " 'job_type': None,\n", + " 'summary': 'Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n", + " 'bullet_points': ['Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies.',\n", + " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience.',\n", + " 'Built backend APIs utilizing Node.js serverless functions for optimal performance.',\n", + " 'Managed data storage and security by implementing a MySQL database.',\n", + " 'Collaborated effectively within a team using Agile methodologies like sprint planning, daily standups, retrospectives to ensure project delivery and continuous improvement.']},\n", + " {'job_title': 'Full Stack Developer',\n", + " 'company_name': 'ROX PAY SRL',\n", + " 'location': None,\n", + " 'start_date': '2021-12-01',\n", + " 'end_date': '2022-12-31',\n", + " 'job_type': None,\n", + " 'summary': 'Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity.',\n", + " 'bullet_points': ['Developed front-end by using React.js and Redux, Javascript/Typescript.',\n", + " 'Contributed developing backend utilizing Django/Python.']},\n", + " {'job_title': 'Freelancer',\n", + " 'company_name': 'FREELANCE',\n", + " 'location': None,\n", + " 'start_date': '2017-09-01',\n", + " 'end_date': '2021-10-31',\n", + " 'job_type': None,\n", + " 'summary': 'Developed and managed many web and mobile applications while working as freelancer at Internet 
Dzyns LLC company.',\n", + " 'bullet_points': ['Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance.',\n", + " 'Developed cross-platform mobile application using Flutter and Ionic/Angular.',\n", + " 'Developed NFT marketplace websites and wrote smart contracts.']},\n", + " {'job_title': 'Server Administrator, Java Developer',\n", + " 'company_name': 'NEUSOFT',\n", + " 'location': None,\n", + " 'start_date': '2014-06-01',\n", + " 'end_date': '2017-08-31',\n", + " 'job_type': None,\n", + " 'summary': 'Worked as intern and software developer after graduated university.',\n", + " 'bullet_points': ['Correct analytical and reasoning skills to troubleshoot and repair server issues.',\n", + " 'Operating Systems & Security Software.',\n", + " 'Java / Spring Boot / Hibernate.']}],\n", + " 'personal_info': {'name': 'GARY JIANG',\n", + " 'phone_number': '+1-213-725-7637',\n", + " 'address': None,\n", + " 'email_address': 'jiangzhehuan0105@gmail.com',\n", + " 'linkedin_url': 'linkedin.com/in/gary-jiang',\n", + " 'github_url': None,\n", + " 'summary': None},\n", + " 'skills': {'Programming Languages': ['Python',\n", + " 'PHP',\n", + " 'Javascript',\n", + " 'Typescript',\n", + " 'HTML',\n", + " 'CSS'],\n", + " 'Tools': ['Flask',\n", + " 'Django',\n", + " 'FastAPI',\n", + " 'Laravel',\n", + " 'Node.js',\n", + " 'SQL databases',\n", + " 'Next.js',\n", + " 'React',\n", + " 'Redux',\n", + " 'Nuxt.js',\n", + " 'Vue',\n", + " 'AWS Lambda',\n", + " 'Cognito',\n", + " 'EC2',\n", + " 'S3',\n", + " 'DynamoDB',\n", + " 'API Gateway',\n", + " 'Flutter',\n", + " 'Ionic',\n", + " 'Angular',\n", + " 'Git',\n", + " 'Version Control'],\n", + " 'Other': ['Startup Experience',\n", + " 'Adaptable',\n", + " 'Resourceful',\n", + " 'Prioritization',\n", + " 'Hybrid Mobile App Development',\n", + " 'AGILE',\n", + " 'SCRUM',\n", + " 'DevOps',\n", + " 
'CI/CD']},\n", + " 'certifications': [],\n", + " 'projects': [{'organization': 'VIMMERSE',\n", + " 'project_name': 'AI-powered Editor Web Application',\n", + " 'location': None,\n", + " 'start_date': '2023-06-01',\n", + " 'end_date': 'present',\n", + " 'descriptions': ['Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n", + " 'Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications.',\n", + " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience.',\n", + " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication.',\n", + " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement.',\n", + " 'Collaborated effectively with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment.']},\n", + " {'organization': 'VIKING SASQUATCH',\n", + " 'project_name': 'Real Estate Transactions API and Integrations',\n", + " 'location': None,\n", + " 'start_date': '2023-01-01',\n", + " 'end_date': '2023-06-30',\n", + " 'descriptions': ['Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n", + " 'Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies.',\n", + " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience.',\n", + " 'Built backend APIs utilizing Node.js serverless functions for optimal performance.',\n", + " 'Managed data storage and security by implementing a MySQL database.',\n", + " 'Collaborated effectively within a team using Agile methodologies like sprint planning, 
daily standups, retrospectives to ensure project delivery and continuous improvement.']},\n", + " {'organization': 'ROX PAY SRL',\n", + " 'project_name': 'Fintech Software House',\n", + " 'location': None,\n", + " 'start_date': '2021-12-01',\n", + " 'end_date': '2022-12-31',\n", + " 'descriptions': ['Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity by essentially creating a Commission Free, Open Loop, Payment Gateway system.',\n", + " 'Developed front-end by using React.js and Redux, Javascript/Typescript.',\n", + " 'Contributed developing backend utilizing Django/Python.']},\n", + " {'organization': 'FREELANCE',\n", + " 'project_name': 'Various Web and Mobile Applications',\n", + " 'location': None,\n", + " 'start_date': '2017-09-01',\n", + " 'end_date': '2021-10-31',\n", + " 'descriptions': ['Developed and managed many web and mobile applications while working as freelancer at Internet Dzyns LLC company.',\n", + " 'Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance.',\n", + " 'Developed cross-platform mobile application using Flutter and Ionic/Angular.',\n", + " 'Developed NFT marketplace websites and wrote smart contracts.']},\n", + " {'organization': 'NEUSOFT',\n", + " 'project_name': 'Server Administration and Java Development',\n", + " 'location': None,\n", + " 'start_date': '2014-06-01',\n", + " 'end_date': '2017-08-31',\n", + " 'descriptions': ['Worked as intern and software developer after graduated university.',\n", + " 'Correct analytical and reasoning skills to troubleshoot and repair server issues.',\n", + " 'Operating Systems & Security Software.',\n", + " 'Java / Spring Boot / Hibernate.']}]}" + ] + }, + "metadata": {}, + "output_type": 
"display_data" + } + ], + "source": [ + "display(json_result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "any", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/async_parse_with_layout.ipynb b/examples/async_parse_with_layout.ipynb new file mode 100644 index 0000000..138ea58 --- /dev/null +++ b/examples/async_parse_with_layout.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries (ipython is used for displaying markdown in this demo)\n", + "# !pip3 install --upgrade ipython\n", + "# !pip3 install --upgrade any-parser" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display, Markdown\n", + "from any_parser import AnyParser" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "ap = AnyParser(api_key=\"...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = \"./sample_data/test_1figure_1table.png\"\n", + "file_id = ap.async_parse_with_layout(file_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n", + "Waiting for response...\n" + ] + } + ], + "source": [ + "markdown_output = ap.async_fetch(file_id=file_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/markdown": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
latency (ms)
participantsmean 99th percentile
1 17.0 +1.4 75.0 34.9
2 24.5 +2.5 87.6 +35.9
5 31.5 +6.2 104.5 52.2
10 30.0 +3.7 95.6 +25.4
25 35.5 +5.6 100.4 42.7
50 42.7 4.1 93.7 22.9
100 71.4 7.6 131.2 +17.6
200 150.5 +11.0320.3 35.1
\n", + "\n", + "\n", + "\n", + "Table 4: Two-phase commit scalability. Mean and standard deviations over 10 runs.\n", + "\n", + "CPUs. Snapshot reads can execute at any up-to-date replicas, so their throughput increases almost linearly with the number of replicas. Single-read read-only transactions only execute at leaders because timestamp assignment must happen at leaders. Read-only-transaction throughput increases with the number of replicas because the number of effective spanservers increases: in the experimental setup, the number of spanservers equaled the number of replicas, and leaders were randomly distributed among the zones. Write throughput benefits from the same experimental artifact (which explains the increase in throughput from 3 to 5 replicas), but that benefit is outweighed by the linear increase in the amount of work performed per write, as the number of replicas increases.\n", + "\n", + "Table 4 demonstrates that two-phase commit can scale to a reasonable number of participants: it summarizes a set of experiments run across 3 zones, each with 25 spanservers. Scaling up to 50 participants is reasonable in both mean and 99th-percentile, and latencies start to rise noticeably at 100 participants.\n", + "\n", + "5.2 Availability\n", + "\n", + "Figure 5 illustrates the availability benefits of running Spanner in multiple datacenters. It shows the results of three experiments on throughput in the presence of datacenter failure, all of which are overlaid onto the same time scale. The test universe consisted of 5 zones Zi, each of which had 25 spanservers. The test database was sharded into 1250 Paxos groups, and 100 test clients constantly issued non-snapshot reads at an aggregate rate of 50K reads/second. All of the leaders were explicitly placed in Z1. 
Five seconds into each test, all of the servers in one zone were killed: non-leader kills Z2; leader-hard kills Z1; leader-soft kills Z1, but it gives notifications to all of the servers that they should handoff leadership first.\n", + "\n", + "Killing Z2 has no effect on read throughput. Killing Z1 while giving the leaders time to handoff leadership to a different zone has a minor effect: the throughput drop is not visible in the graph, but is around 3-4%. On the other hand, killing Z1 with no warning has a severe effect: the rate of completion drops almost to 0. As leaders get re-elected, though, the throughput of the system rises to approximately 100K reads/second because of two artifacts of our experiment: there is extra capacity in the system, and operations are queued while the leader is unavailable. As a result, the throughput of the system rises before leveling off again at its steady-state rate.\n", + "\n", + "We can also see the effect of the fact that Paxos leader leases are set to 10 seconds. When we kill the zone, the leader-lease expiration times for the groups should be evenly distributed over the next 10 seconds. Soon after each lease from a dead leader expires, a new leader is elected. Approximately 10 seconds after the kill time, all of the groups have leaders and throughput has recovered. Shorter lease times would reduce the effect of server deaths on availability, but would require greater amounts of lease-renewal network traffic. We are in the process of designing and implementing a mechanism that will cause slaves to release Paxos leader leases upon leader failure.\n", + "\n", + "5.3 TrueTime\n", + "\n", + "Two questions must be answered with respect to TrueTime: is ε truly a bound on clock uncertainty, and how bad does ε get? For the former, the most serious problem would be if a local clock’s drift were greater than 200us/sec: that would break assumptions made by TrueTime. 
Our machine statistics show that bad CPUs are 6 times more likely than bad clocks. That is, clock issues are extremely infrequent, relative to much more serious hardware problems. As a result, we believe that TrueTime’s implementation is as trustworthy as any other piece of software upon which Spanner depends.\n", + "\n", + "![<@mask_p0_e1_figure(timeout=1h)>](https://anyparser-realtime-test-j-assetsconstructfilebucke-2wg0ln280yvz.s3.amazonaws.com/result_parse_with_layout/async_S4iyw7RAEE8CTGkVgHYeI8nsTmSALI1U2HXvAN6j/2024/11/5/test_1figure_1table_d9b3dc10-ac4f-4528-b259-53ef3c6d09d9.png/%3C%40mask_p0_e1_figure_s3%3E.png?AWSAccessKeyId=REDACTED&Signature=REDACTED&x-amz-security-token=REDACTED&Expires=1730833293)\n", + "\n", + "Figure 5: Effect of killing servers on throughput.\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + 
"source": [ + "display(Markdown(markdown_output))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "any", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/extract_resume_key_value.ipynb b/examples/extract_resume_key_value.ipynb new file mode 100644 index 0000000..df76300 --- /dev/null +++ b/examples/extract_resume_key_value.ipynb @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries (ipython is used for displaying markdown in this demo)\n", + "# !pip3 install --upgrade ipython\n", + "# !pip3 install --upgrade any-parser" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display\n", + "from any_parser import AnyParser" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "ap = AnyParser(api_key=\"...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = \"./sample_data/resume_1.pdf\"\n", + "json_result = ap.extract_resume_key_value(file_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "({'pii': {'full_name': 'GARY JIANG',\n", + " 'email': 'jiangzhehuan0105@gmail.com',\n", + " 'phone': '+1 (213) 725-7637'},\n", + " 'education': [{'organization': 'Shenyang University of Technology',\n", + " 'degree': \"Bachelor's Degree\",\n", + " 'major': 'Computer Science',\n", + " 'start_date': '2008-01-01',\n", + " 'end_date': 
'2012-12-31',\n", + " 'courses': None,\n", + " 'achievements': None}],\n", + " 'work_experience': [{'job_title': 'Full Stack Developer',\n", + " 'company_name': 'VIMMERSE',\n", + " 'location': None,\n", + " 'start_date': '2023-06-01',\n", + " 'end_date': 'present',\n", + " 'job_type': None,\n", + " 'summary': 'Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n", + " 'bullet_points': ['Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications',\n", + " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience',\n", + " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication',\n", + " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement',\n", + " 'Collaborated effectively with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment']},\n", + " {'job_title': 'Full Stack Developer',\n", + " 'company_name': 'VIKING SASQUATCH',\n", + " 'location': None,\n", + " 'start_date': '2023-01-01',\n", + " 'end_date': '2023-06-01',\n", + " 'job_type': None,\n", + " 'summary': 'Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n", + " 'bullet_points': ['Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies',\n", + " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience',\n", + " 'Built backend APIs utilizing Node.js serverless functions for optimal performance',\n", + " 'Managed data storage and security by implementing a MySQL database',\n", + " 'Collaborated effectively 
within a team using Agile methodologies like sprint planning, daily standups, retrospectives to ensure project delivery and continuous improvement']},\n", + " {'job_title': 'Full Stack Developer',\n", + " 'company_name': 'ROX PAY SRL',\n", + " 'location': None,\n", + " 'start_date': '2021-12-01',\n", + " 'end_date': '2022-12-31',\n", + " 'job_type': None,\n", + " 'summary': 'Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution.',\n", + " 'bullet_points': ['Developed front-end by using React.js and Redux, Javascript/Typescript',\n", + " 'Contributed developing backend utilizing Django/Python']},\n", + " {'job_title': 'Freelancer',\n", + " 'company_name': 'FREELANCE',\n", + " 'location': None,\n", + " 'start_date': '2017-09-01',\n", + " 'end_date': '2021-10-31',\n", + " 'job_type': None,\n", + " 'summary': 'Developed and managed many web and mobile applications while working as freelancer.',\n", + " 'bullet_points': ['Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance',\n", + " 'Developed cross-platform mobile application using Flutter and Ionic/Angular',\n", + " 'Developed NFT marketplace websites and wrote smart contracts']},\n", + " {'job_title': 'Server Administrator, Java Developer',\n", + " 'company_name': 'NEUSOFT',\n", + " 'location': None,\n", + " 'start_date': '2014-06-01',\n", + " 'end_date': '2017-08-31',\n", + " 'job_type': None,\n", + " 'summary': 'Worked as intern and software developer after graduated university.',\n", + " 'bullet_points': ['Correct analytical and reasoning skills to troubleshoot and repair server issues',\n", + " 'Operating Systems & Security Software',\n", + " 'Java / Spring Boot / Hibernate']}],\n", + " 'personal_info': {'name': 'GARY JIANG',\n", + " 'phone_number': '+1-213-725-7637',\n", + " 'address': None,\n", + " 'email_address': 
'jiangzhehuan0105@gmail.com',\n", + " 'linkedin_url': 'linkedin.com/in/gary-jiang',\n", + " 'github_url': None,\n", + " 'summary': None},\n", + " 'skills': {'Programming Languages': ['Python',\n", + " 'PHP',\n", + " 'Javascript',\n", + " 'Typescript',\n", + " 'HTML',\n", + " 'CSS'],\n", + " 'Tools': ['Flask',\n", + " 'Django',\n", + " 'FastAPI',\n", + " 'Laravel',\n", + " 'Node.js',\n", + " 'SQL databases',\n", + " 'Next.js',\n", + " 'React',\n", + " 'Redux',\n", + " 'Nuxt.js',\n", + " 'Vue',\n", + " 'AWS Lambda',\n", + " 'AWS Cognito',\n", + " 'AWS EC2',\n", + " 'AWS S3',\n", + " 'AWS DynamoDB',\n", + " 'AWS API Gateway',\n", + " 'Flutter',\n", + " 'Ionic',\n", + " 'Angular',\n", + " 'Git',\n", + " 'Version Control'],\n", + " 'Other': ['Startup Experience',\n", + " 'Adaptable',\n", + " 'Resourceful',\n", + " 'Prioritization',\n", + " 'Hybrid Mobile App Development',\n", + " 'AGILE',\n", + " 'SCRUM',\n", + " 'DevOps',\n", + " 'CI/CD']},\n", + " 'certifications': [],\n", + " 'projects': [{'organization': 'VIMMERSE',\n", + " 'project_name': 'AI-powered Editor Web Application',\n", + " 'location': None,\n", + " 'start_date': '2023-06-01',\n", + " 'end_date': 'present',\n", + " 'descriptions': ['Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n", + " 'Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications.',\n", + " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience.',\n", + " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication.',\n", + " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement.',\n", + " 'Collaborated effectively with cross-functional teams (design, product 
management) to deliver projects on time, fostering a positive and collaborative work environment.']},\n", + " {'organization': 'VIKING SASQUATCH',\n", + " 'project_name': 'Real Estate Transactions API and Integrations',\n", + " 'location': None,\n", + " 'start_date': '2023-01-01',\n", + " 'end_date': '2023-06-30',\n", + " 'descriptions': ['Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n", + " 'Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies.',\n", + " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience.',\n", + " 'Built backend APIs utilizing Node.js serverless functions for optimal performance.',\n", + " 'Managed data storage and security by implementing a MySQL database.',\n", + " 'Collaborated effectively within a team using Agile methodologies like sprint planning, daily standups, retrospectives to ensure project delivery and continuous improvement.']},\n", + " {'organization': 'ROX PAY SRL',\n", + " 'project_name': 'Fintech Software House',\n", + " 'location': None,\n", + " 'start_date': '2021-12-01',\n", + " 'end_date': '2022-12-31',\n", + " 'descriptions': ['Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity by essentially creating a Commission Free, Open Loop, Payment Gateway system.',\n", + " 'Developed front-end by using React.js and Redux, Javascript/Typescript.',\n", + " 'Contributed developing backend utilizing Django/Python.']},\n", + " {'organization': 'FREELANCE',\n", + " 'project_name': 'Various Web and Mobile Applications',\n", + " 'location': None,\n", + " 'start_date': '2017-09-01',\n", + " 'end_date': '2021-10-31',\n", + " 'descriptions': ['Developed and managed many web and mobile applications while working as freelancer at Internet 
Dzyns LLC company.',\n", + " 'Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance.',\n", + " 'Developed cross-platform mobile application using Flutter and Ionic/Angular.',\n", + " 'Developed NFT marketplace websites and wrote smart contracts.']},\n", + " {'organization': 'NEUSOFT',\n", + " 'project_name': 'Server Administration and Java Development',\n", + " 'location': None,\n", + " 'start_date': '2014-06-01',\n", + " 'end_date': '2017-08-31',\n", + " 'descriptions': ['Worked as intern and software developer after graduated university.',\n", + " 'Correct analytical and reasoning skills to troubleshoot and repair server issues.',\n", + " 'Operating Systems & Security Software.',\n", + " 'Java / Spring Boot / Hibernate.']}]},\n", + " 'Time Elapsed: 17.22 seconds')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(json_result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "any", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/sample_data/resume_1.pdf b/examples/sample_data/resume_1.pdf new file mode 100644 index 0000000..b914f8e Binary files /dev/null and b/examples/sample_data/resume_1.pdf differ diff --git a/examples/sample_data/resume_1.png b/examples/sample_data/resume_1.png new file mode 100644 index 0000000..6a1e276 Binary files /dev/null and b/examples/sample_data/resume_1.png differ diff --git a/examples/sample_data/test_1figure_1table.png b/examples/sample_data/test_1figure_1table.png new file mode 100644 index 
0000000..039c8d1 Binary files /dev/null and b/examples/sample_data/test_1figure_1table.png differ