Skip to content

Commit

Permalink
add async extract resume, change name to extract_resume_key_value t…
Browse files Browse the repository at this point in the history
…o match cambio-core
  • Loading branch information
jojortz committed Nov 2, 2024
1 parent 20452ff commit 2a98e4d
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 18 deletions.
57 changes: 56 additions & 1 deletion any_parser/any_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def extract_key_value(
else:
return f"Error: {response.status_code} {response.text}", None

def resume_extract(
def extract_resume_key_value(
self,
file_path: str,
extract_type: Dict,
Expand Down Expand Up @@ -356,6 +356,59 @@ def async_extract_key_value(
# If response successful, upload the file
return upload_file_to_presigned_url(file_path, response)

def async_extract_resume_key_value(
    self,
    file_path: str,
    extract_type: str,
) -> str:
    """Extract key-value pairs from a resume asynchronously.

    The file path and extract type are validated locally first. If both
    checks pass, a ``resume_extract`` job is registered with the async
    upload endpoint and the file is handed to
    ``upload_file_to_presigned_url`` together with that response.

    Args:
        file_path (str): The path to the file to be parsed.
        extract_type (str): The type of extraction to be performed. It can be one of the following:
            - "education": Education
            - "work_experience": Work Experience
            - "personal_info": Personal Information
            - "skills": Skills
            - "certifications": Certifications
            - "projects": Projects
            - "pii": Personally Identifiable Information - includes only name, email, and phone

    Returns:
        str: The file id of the uploaded file (whatever
            ``upload_file_to_presigned_url`` returns). If local validation
            fails, the error message produced by the failing check is
            returned instead, as a plain string.
            NOTE(review): callers appear to detect failures by checking for
            an "Error:" prefix - confirm the validators use that prefix.
    """
    file_extension = Path(file_path).suffix.lower().lstrip(".")

    # Check if the file exists and file_type
    error = check_file_type_and_path(file_path, file_extension)
    if error:
        # Return the message itself (not a tuple) so the result always
        # matches the declared ``str`` return type.
        return error

    # Reject unsupported extract types before any network traffic happens.
    error = check_resume_extract_type(extract_type)
    if error:
        return error

    file_name = Path(file_path).name

    # Create the JSON payload
    payload = {
        "file_name": file_name,
        "process_type": "resume_extract",
        "extract_args": {
            "extract_type": extract_type,
        },
    }

    # Send the POST request
    response = requests.post(
        self._async_upload_url,
        headers=self._headers,
        data=json.dumps(payload),
        timeout=TIMEOUT,
    )

    # If response successful, upload the file
    return upload_file_to_presigned_url(file_path, response)

def async_fetch(
self,
file_id: str,
Expand Down Expand Up @@ -407,6 +460,8 @@ def async_fetch(
result = response.json()
if "json" in result:
return result["json"]
elif "resume_extraction" in result:
return result["resume_extraction"]
elif "markdown" in result:
markdown_list = result["markdown"]
return "\n".join(markdown_list)
Expand Down
2 changes: 1 addition & 1 deletion examples/pdf_resume_to_json.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@
"ap = AnyParser(example_apikey)\n",
"\n",
"# extract returns a tuple containing the markdown as a string and total time\n",
"resume_extract_result, total_time = ap.resume_extract(example_local_file, extract_type=\"work_experience\")\n",
"resume_extract_result, total_time = ap.extract_resume_key_value(example_local_file, extract_type=\"work_experience\")\n",
"\n",
"display(resume_extract_result)\n",
"print(total_time)"
Expand Down
52 changes: 36 additions & 16 deletions tests/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,15 +206,15 @@ def test_async_extract_key_value_and_fetch(self):
# wait 1 s between requests
time.sleep(1)

def test_sync_extract_resume(self):
def test_sync_extract_resume_key_value(self):
"""Synchronous Resume Extraction with subtests for different file formats"""
for data in EXTRACT_RESUME_TEST_DATA:
for extract_type in data["correct_output"]:
with self.subTest(
working_file=data["working_file"], extract_type=extract_type
):
# extract
key_value_result, elapsed_time = self.ap.resume_extract(
key_value_result, elapsed_time = self.ap.extract_resume_key_value(
data["working_file"], extract_type=extract_type
)
print("\n\n Key Value Result: ")
Expand All @@ -228,25 +228,45 @@ def test_sync_extract_resume(self):
str(key_value_result), str(data["correct_output"][extract_type])
)

# TODO: Fix these extract_type to properly output the correct output
if extract_type in [
"education",
"skills",
"certifications",
"projects",
]:
print(f"Percentage: {percentage:.2f}%")
else:
self.assertGreaterEqual(
percentage,
90,
f"Output similarity too low: {percentage:.2f}%",
)
self.assertGreaterEqual(
percentage,
80,
f"Output similarity too low: {percentage:.2f}%",
)

self.assertIn("Time Elapsed", elapsed_time)
# wait 1 s between requests
time.sleep(1)

def test_async_extract_resume_key_value_and_fetch(self):
    """Asynchronous Resume Extraction and Fetch.

    For every resume fixture and every extract type listed in its
    ``correct_output``, submit an asynchronous extraction, fetch the result
    by file id, and require at least 80% similarity (as scored by
    ``compare_markdown``) between the stringified result and the
    stringified expected output.
    """
    for data in EXTRACT_RESUME_TEST_DATA:
        for extract_type in data["correct_output"]:
            with self.subTest(
                working_file=data["working_file"], extract_type=extract_type
            ):
                # extract
                file_id = self.ap.async_extract_resume_key_value(
                    data["working_file"], extract_type=extract_type
                )
                # Guard the string check below: a non-string result should
                # show up as a readable failure, not an AttributeError.
                self.assertIsInstance(
                    file_id, str, f"Unexpected extract result: {file_id!r}"
                )
                self.assertFalse(file_id.startswith("Error:"), file_id)
                # fetch
                extract_resume_result = self.ap.async_fetch(file_id=file_id)
                # TODO: update with proper value checking
                # compare string of correct output vs. fetched result
                percentage = compare_markdown(
                    str(extract_resume_result),
                    str(data["correct_output"][extract_type]),
                )

                self.assertGreaterEqual(
                    percentage,
                    80,
                    f"Output similarity too low: {percentage:.2f}%",
                )
            # wait 1 s between requests; kept outside the subTest block so
            # the pause still happens when a subtest fails
            time.sleep(1)


if __name__ == "__main__":
unittest.main(verbosity=2)

0 comments on commit 2a98e4d

Please sign in to comment.