add async extract resume, change name to extract_resume_key_value to match cambio-core
CambioML · Nov 2, 2024 · 2a98e4d · 2a98e4d
1 parent 20452ff
commit 2a98e4d
Show file tree

Hide file tree

Showing 3 changed files with 93 additions and 18 deletions.
diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py
@@ -189,7 +189,7 @@ def extract_key_value(
         else:
             return f"Error: {response.status_code} {response.text}", None
 
-    def resume_extract(
+    def extract_resume_key_value(
         self,
         file_path: str,
         extract_type: Dict,
@@ -356,6 +356,59 @@ def async_extract_key_value(
         # If response successful, upload the file
         return upload_file_to_presigned_url(file_path, response)
 
+    def async_extract_resume_key_value(
+        self,
+        file_path: str,
+        extract_type: str,
+    ) -> str:
+        """Extract key-value pairs from a resume asynchronously.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+            extract_type (str): The type of extraction to be performed. It can be one of the following:
+                - "education": Education
+                - "work_experience": Work Experience
+                - "personal_info": Personal Information
+                - "skills": Skills
+                - "certifications": Certifications
+                - "projects": Projects
+                - "pii": Personally Identifiable Information - includes only name, email, and phone
+        Returns:
+            str: The file id of the uploaded file, or the error reported by input validation.
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Validate path and file type; return the error as a plain string to honor `-> str`
+        error = check_file_type_and_path(file_path, file_extension)
+        if error:
+            return error
+
+        error = check_resume_extract_type(extract_type)
+        if error:
+            return error
+
+        file_name = Path(file_path).name
+
+        # Create the JSON payload
+        payload = {
+            "file_name": file_name,
+            "process_type": "resume_extract",
+            "extract_args": {
+                "extract_type": extract_type,
+            },
+        }
+
+        # Send the POST request
+        response = requests.post(
+            self._async_upload_url,
+            headers=self._headers,
+            data=json.dumps(payload),
+            timeout=TIMEOUT,
+        )
+
+        # If response successful, upload the file
+        return upload_file_to_presigned_url(file_path, response)
+
     def async_fetch(
         self,
         file_id: str,
@@ -407,6 +460,8 @@ def async_fetch(
             result = response.json()
             if "json" in result:
                 return result["json"]
+            elif "resume_extraction" in result:
+                return result["resume_extraction"]
             elif "markdown" in result:
                 markdown_list = result["markdown"]
                 return "\n".join(markdown_list)

diff --git a/examples/pdf_resume_to_json.ipynb b/examples/pdf_resume_to_json.ipynb
@@ -215,7 +215,7 @@
     "ap = AnyParser(example_apikey)\n",
     "\n",
     "# extract returns a tuple containing the markdown as a string and total time\n",
-    "resume_extract_result, total_time = ap.resume_extract(example_local_file, extract_type=\"work_experience\")\n",
+    "resume_extract_result, total_time = ap.extract_resume_key_value(example_local_file, extract_type=\"work_experience\")\n",
     "\n",
     "display(resume_extract_result)\n",
     "print(total_time)"

diff --git a/tests/test.py b/tests/test.py
@@ -206,15 +206,15 @@ def test_async_extract_key_value_and_fetch(self):
                 # wait 1 s between requests
                 time.sleep(1)
 
-    def test_sync_extract_resume(self):
+    def test_sync_extract_resume_key_value(self):
         """Synchronous Resume Extraction with subtests for different file formats"""
         for data in EXTRACT_RESUME_TEST_DATA:
             for extract_type in data["correct_output"]:
                 with self.subTest(
                     working_file=data["working_file"], extract_type=extract_type
                 ):
                     # extract
-                    key_value_result, elapsed_time = self.ap.resume_extract(
+                    key_value_result, elapsed_time = self.ap.extract_resume_key_value(
                         data["working_file"], extract_type=extract_type
                     )
                     print("\n\n Key Value Result: ")
@@ -228,25 +228,45 @@ def test_sync_extract_resume(self):
                         str(key_value_result), str(data["correct_output"][extract_type])
                     )
 
-                    # TODO: Fix these extract_type to properly output the correct output
-                    if extract_type in [
-                        "education",
-                        "skills",
-                        "certifications",
-                        "projects",
-                    ]:
-                        print(f"Percentage: {percentage:.2f}%")
-                    else:
-                        self.assertGreaterEqual(
-                            percentage,
-                            90,
-                            f"Output similarity too low: {percentage:.2f}%",
-                        )
+                    self.assertGreaterEqual(
+                        percentage,
+                        80,
+                        f"Output similarity too low: {percentage:.2f}%",
+                    )
 
                     self.assertIn("Time Elapsed", elapsed_time)
                     # wait 1 s between requests
                     time.sleep(1)
 
+    def test_async_extract_resume_key_value_and_fetch(self):
+        """Asynchronous Resume Extraction and Fetch with subtests per file and extract type"""
+        for data in EXTRACT_RESUME_TEST_DATA:
+            for extract_type in data["correct_output"]:
+                with self.subTest(
+                    working_file=data["working_file"], extract_type=extract_type
+                ):
+                    # submit the file for asynchronous extraction; a string result is expected
+                    file_id = self.ap.async_extract_resume_key_value(
+                        data["working_file"], extract_type=extract_type
+                    )
+                    self.assertFalse(file_id.startswith("Error:"), file_id)
+                    # fetch the extraction result for the returned file id
+                    extract_resume_result = self.ap.async_fetch(file_id=file_id)
+                    # TODO: update with proper value checking
+                    # similarity of stringified result vs. expected output (presumably Levenshtein-based; confirm)
+                    percentage = compare_markdown(
+                        str(extract_resume_result),
+                        str(data["correct_output"][extract_type]),
+                    )
+
+                    self.assertGreaterEqual(
+                        percentage,
+                        80,
+                        f"Output similarity too low: {percentage:.2f}%",
+                    )
+                    # wait 1 s between requests
+                    time.sleep(1)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)