From 8e58501af8cbf6cf5d66c81240d0ec6e7ae693a5 Mon Sep 17 00:00:00 2001
From: Jojo Ortiz <ortiz.jroman@gmail.com>
Date: Fri, 25 Oct 2024 12:57:11 -0700
Subject: [PATCH] update resume_extract to use utils.py, update tests

---
 any_parser/any_parser.py          |  6 ++++--
 any_parser/utils.py               | 17 +++++++++++++++++
 examples/pdf_resume_to_json.ipynb | 13 +++++++------
 tests/test.py                     | 22 ++++++++++++++++------
 tests/test_data.py                |  5 +++++
 5 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py
index 32c6b27..72a8f14 100644
--- a/any_parser/any_parser.py
+++ b/any_parser/any_parser.py
@@ -13,6 +13,7 @@
     ModelType,
     check_file_type_and_path,
     check_model,
+    check_resume_extract_type,
     upload_file_to_presigned_url,
 )
 
@@ -204,17 +205,18 @@ def resume_extract(
                 - "skills": Skills
                 - "certifications": Certifications
                 - "projects": Projects
+                - "pii": Personally Identifiable Information - includes only name, email, and phone
         Returns:
             tuple(str, str): The extracted data and the time taken.
         """
         file_extension = Path(file_path).suffix.lower().lstrip(".")
 
         # Check if the file exists and file_type
-        error = self._check_file_type_and_path(file_path, file_extension)
+        error = check_file_type_and_path(file_path, file_extension)
         if error:
             return error, None
 
-        error = self._check_resume_extract_type(extract_type)
+        error = check_resume_extract_type(extract_type)
         if error:
             return error, None
 
diff --git a/any_parser/utils.py b/any_parser/utils.py
index ed70fe1..b5927d3 100644
--- a/any_parser/utils.py
+++ b/any_parser/utils.py
@@ -22,6 +22,16 @@ class ModelType(Enum):
     "gif",
 ]
 
+SUPPORTED_RESUME_EXTRACT_TYPES = [
+    "pii",
+    "education",
+    "work_experience",
+    "personal_info",
+    "skills",
+    "certifications",
+    "projects",
+]
+
 
 def upload_file_to_presigned_url(
     file_path: str, response: requests.Response, timeout: int = 10
@@ -61,3 +71,10 @@ def check_file_type_and_path(file_path, file_extension):
     if file_extension not in SUPPORTED_FILE_EXTENSIONS:
         supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
         return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."
+
+
+def check_resume_extract_type(extract_type):
+    # Check if the extract type is supported for resume_extract
+    if extract_type not in SUPPORTED_RESUME_EXTRACT_TYPES:
+        supported_types = ", ".join(SUPPORTED_RESUME_EXTRACT_TYPES)
+        return f"Error: Unsupported resume extract type: {extract_type}. Supported extract types include {supported_types}."
diff --git a/examples/pdf_resume_to_json.ipynb b/examples/pdf_resume_to_json.ipynb
index 1417fc7..60d2808 100644
--- a/examples/pdf_resume_to_json.ipynb
+++ b/examples/pdf_resume_to_json.ipynb
@@ -55,10 +55,10 @@
     "# !pip3 install --upgrade any-parser\n",
     "\n",
     "# Option 2: if you have sdk respository installed locally, add system path\n",
-    "import sys\n",
-    "sys.path.append(\".\")\n",
-    "sys.path.append(\"..\")\n",
-    "sys.path.append(\"../..\")"
+    "# import sys\n",
+    "# sys.path.append(\".\")\n",
+    "# sys.path.append(\"..\")\n",
+    "# sys.path.append(\"../..\")"
    ]
   },
   {
@@ -157,7 +157,8 @@
     "We need to specify 2 parameters:\n",
     "- `file_path`: The path to the PDF file.\n",
     "- `extract_type`: This specifies the type of information that you want to extract from the resume. It can be one of the following:\n",
-    "    - `personal_info`: Extracts personal information from the resume, such as name, email, phone number, etc.\n",
+    "    - `pii`: Extracts name, email, and phone number.\n",
+    "    - `personal_info`: Extracts personal information from the resume, all the same as `pii` plus things like LinkedIn and Github urls.\n",
     "    - `education`: Extracts educational information from the resume, such as degree, university, graduation year, etc.\n",
     "    - `work_experience`: Extracts work experience information from the resume, such as company name, job title, start date, end date, etc.\n",
     "    - `skills`: Extracts skills information from the resume. Categorized into 'Programming Languages', 'Tools', and 'Other'\n",
@@ -206,7 +207,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Time Elapsed: 13.96 seconds\n"
+      "Time Elapsed: 10.40 seconds\n"
      ]
     }
    ],
diff --git a/tests/test.py b/tests/test.py
index c24b3d9..66b4d19 100755
--- a/tests/test.py
+++ b/tests/test.py
@@ -206,9 +206,6 @@ def test_async_extract_key_value_and_fetch(self):
                 # wait 1 s between requests
                 time.sleep(1)
 
-    @unittest.skip(
-        "Skipping this test temporarily"
-    )  # TODO: fix resume extract to pass this test
     def test_sync_extract_resume(self):
         """Synchronous Resume Extraction with subtests for different file formats"""
         for data in EXTRACT_RESUME_TEST_DATA:
@@ -225,13 +222,26 @@ def test_sync_extract_resume(self):
                     print("\n\n Correct Output: ")
                     print(data["correct_output"][extract_type])
 
+                    # TODO: update with proper value checking
                     # get levenshtein distance from string of correct output vs. key value result
                     percentage = compare_markdown(
                         str(key_value_result), str(data["correct_output"][extract_type])
                     )
-                    self.assertGreaterEqual(
-                        percentage, 90, f"Output similarity too low: {percentage:.2f}%"
-                    )
+
+                    # TODO: Fix these extract_type to properly output the correct output
+                    if extract_type in [
+                        "education",
+                        "skills",
+                        "certifications",
+                        "projects",
+                    ]:
+                        print(f"Percentage: {percentage:.2f}%")
+                    else:
+                        self.assertGreaterEqual(
+                            percentage,
+                            90,
+                            f"Output similarity too low: {percentage:.2f}%",
+                        )
 
                     self.assertIn("Time Elapsed", elapsed_time)
                     # wait 1 s between requests
diff --git a/tests/test_data.py b/tests/test_data.py
index c2a008b..e0d9047 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -81,6 +81,11 @@
     {
         "working_file": "./examples/sample_data/test_resume.pdf",
         "correct_output": {
+            "pii": {
+                "full_name": "John Doe",
+                "email": "johndoe@example.com",
+                "phone": "(123) 456-7890",
+            },
             "personal_info": {
                 "name": "John Doe",
                 "phone_number": "+1-123-456-7890",