Skip to content

Commit

Permalink
update resume_extract to use utils.py, update tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jojortz committed Oct 25, 2024
1 parent ad0fc39 commit 8e58501
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 14 deletions.
6 changes: 4 additions & 2 deletions any_parser/any_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
ModelType,
check_file_type_and_path,
check_model,
check_resume_extract_type,
upload_file_to_presigned_url,
)

Expand Down Expand Up @@ -204,17 +205,18 @@ def resume_extract(
- "skills": Skills
- "certifications": Certifications
- "projects": Projects
- "pii": Personally Identifiable Information - includes only name, email, and phone
Returns:
tuple(str, str): The extracted data and the time taken.
"""
file_extension = Path(file_path).suffix.lower().lstrip(".")

# Check if the file exists and file_type
error = self._check_file_type_and_path(file_path, file_extension)
error = check_file_type_and_path(file_path, file_extension)
if error:
return error, None

error = self._check_resume_extract_type(extract_type)
error = check_resume_extract_type(extract_type)
if error:
return error, None

Expand Down
17 changes: 17 additions & 0 deletions any_parser/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@ class ModelType(Enum):
"gif",
]

# Extract types accepted for resume extraction; check_resume_extract_type
# rejects any value that is not listed here.
SUPPORTED_RESUME_EXTRACT_TYPES = [
"pii",
"education",
"work_experience",
"personal_info",
"skills",
"certifications",
"projects",
]


def upload_file_to_presigned_url(
file_path: str, response: requests.Response, timeout: int = 10
Expand Down Expand Up @@ -61,3 +71,10 @@ def check_file_type_and_path(file_path, file_extension):
if file_extension not in SUPPORTED_FILE_EXTENSIONS:
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."


def check_resume_extract_type(extract_type):
    """Validate the extract type requested for resume extraction.

    Args:
        extract_type: The requested extract type.

    Returns:
        None when ``extract_type`` is listed in
        ``SUPPORTED_RESUME_EXTRACT_TYPES``; otherwise an error message
        string naming the rejected value and the supported types.
    """
    # Guard clause: a supported type produces no error.
    if extract_type in SUPPORTED_RESUME_EXTRACT_TYPES:
        return None

    supported_types = ", ".join(SUPPORTED_RESUME_EXTRACT_TYPES)
    return f"Error: Unsupported resume extract type: {extract_type}. Supported extract types include {supported_types}."
13 changes: 7 additions & 6 deletions examples/pdf_resume_to_json.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@
"# !pip3 install --upgrade any-parser\n",
"\n",
"# Option 2: if you have the SDK repository installed locally, add it to the system path\n",
"import sys\n",
"sys.path.append(\".\")\n",
"sys.path.append(\"..\")\n",
"sys.path.append(\"../..\")"
"# import sys\n",
"# sys.path.append(\".\")\n",
"# sys.path.append(\"..\")\n",
"# sys.path.append(\"../..\")"
]
},
{
Expand Down Expand Up @@ -157,7 +157,8 @@
"We need to specify 2 parameters:\n",
"- `file_path`: The path to the PDF file.\n",
"- `extract_type`: This specifies the type of information that you want to extract from the resume. It can be one of the following:\n",
" - `personal_info`: Extracts personal information from the resume, such as name, email, phone number, etc.\n",
" - `pii`: Extracts name, email, and phone number.\n",
" - `personal_info`: Extracts personal information from the resume: everything in `pii`, plus additional details such as LinkedIn and GitHub URLs.\n",
" - `education`: Extracts educational information from the resume, such as degree, university, graduation year, etc.\n",
" - `work_experience`: Extracts work experience information from the resume, such as company name, job title, start date, end date, etc.\n",
" - `skills`: Extracts skills information from the resume. Categorized into 'Programming Languages', 'Tools', and 'Other'\n",
Expand Down Expand Up @@ -206,7 +207,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Time Elapsed: 13.96 seconds\n"
"Time Elapsed: 10.40 seconds\n"
]
}
],
Expand Down
22 changes: 16 additions & 6 deletions tests/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,6 @@ def test_async_extract_key_value_and_fetch(self):
# wait 1 s between requests
time.sleep(1)

@unittest.skip(
"Skipping this test temporarily"
) # TODO: fix resume extract to pass this test
def test_sync_extract_resume(self):
"""Synchronous Resume Extraction with subtests for different file formats"""
for data in EXTRACT_RESUME_TEST_DATA:
Expand All @@ -225,13 +222,26 @@ def test_sync_extract_resume(self):
print("\n\n Correct Output: ")
print(data["correct_output"][extract_type])

# TODO: update with proper value checking
# get levenshtein distance from string of correct output vs. key value result
percentage = compare_markdown(
str(key_value_result), str(data["correct_output"][extract_type])
)
self.assertGreaterEqual(
percentage, 90, f"Output similarity too low: {percentage:.2f}%"
)

# TODO: Fix these extract_type to properly output the correct output
if extract_type in [
"education",
"skills",
"certifications",
"projects",
]:
print(f"Percentage: {percentage:.2f}%")
else:
self.assertGreaterEqual(
percentage,
90,
f"Output similarity too low: {percentage:.2f}%",
)

self.assertIn("Time Elapsed", elapsed_time)
# wait 1 s between requests
Expand Down
5 changes: 5 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@
{
"working_file": "./examples/sample_data/test_resume.pdf",
"correct_output": {
"pii": {
"full_name": "John Doe",
"email": "[email protected]",
"phone": "(123) 456-7890",
},
"personal_info": {
"name": "John Doe",
"phone_number": "+1-123-456-7890",
Expand Down

0 comments on commit 8e58501

Please sign in to comment.