Skip to content

Commit

Permalink
Merge branch 'main' into extract-resume
Browse files Browse the repository at this point in the history
  • Loading branch information
jojortz committed Oct 25, 2024
2 parents 2636f64 + cde5b95 commit ad0fc39
Show file tree
Hide file tree
Showing 9 changed files with 167 additions and 137 deletions.
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

## :seedling: Set up your AnyParser API key

To get started, generate your API key from the [Playground Account Page](https://www.cambioml.com/account). Each account comes with **100 free pages**.
To get started, generate your API key from the [Sandbox Account Page](https://www.cambioml.com/account). Each account comes with **100 free pages**.

> ⚠️ **Note:** The free API is limited to 10 pages/call.
Expand All @@ -25,18 +25,18 @@ CAMBIO_API_KEY=0cam************************

## :computer: Installation
### 1. Set Up a New Conda Environment and Install AnyParser
First, create and activate a new Conda environment, then install AnyParser:
First, create and activate a new Conda environment, then install AnyParser:
```bash
conda create -n any-parse python=3.10 -y
conda activate any-parse
pip3 install any-parser
```
### 2. Create an AnyParser Instance Using Your API Key
Use your API key to create an instance of AnyParserRT. Make sure you’ve set up your .env file to store your API key securely:
Use your API key to create an instance of `AnyParser`. Make sure you’ve set up your `.env` file to store your API key securely:
```python
import os
from dotenv import load_dotenv
from any_parser import AnyParserRT # Import the AnyParserRT class
from any_parser import AnyParser

# Load environment variables
load_dotenv(override=True)
Expand All @@ -45,7 +45,7 @@ load_dotenv(override=True)
example_apikey = os.getenv("CAMBIO_API_KEY")

# Create an AnyParser instance
ap = AnyParserRT(api_key=example_apikey)
ap = AnyParser(api_key=example_apikey)
```

### 3. Run Synchronous Extraction
Expand Down Expand Up @@ -73,4 +73,3 @@ Are you an AI engineer looking to **accurately** extract both the text and layou

### [Extract a Table from an Image into Markdown Format](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb)
Are you a financial analyst needing to **accurately** extract numbers from a table within an image? Explore this [3-minute notebook example](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb).

2 changes: 1 addition & 1 deletion any_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@

__all__ = ["AnyParser", "ModelType"]

__version__ = "0.0.16"
__version__ = "0.0.17"
137 changes: 39 additions & 98 deletions any_parser/any_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,33 +9,15 @@

import requests

from any_parser.utils import (
ModelType,
check_file_type_and_path,
check_model,
upload_file_to_presigned_url,
)

PUBLIC_SHARED_BASE_URL = "https://public-api.cambio-ai.com"
TIMEOUT = 60
SUPPORTED_FILE_EXTENSIONS = [
"pdf",
"doc",
"docx",
"ppt",
"pptx",
"jpg",
"jpeg",
"png",
"gif",
]
RESULT_TYPES = ["markdown", "json"]
RESUME_EXTRACT_TYPES = [
"education",
"work_experience",
"personal_info",
"skills",
"certifications",
"projects",
]


class ModelType(Enum):
BASE = "base"
PRO = "pro"


class ProcessType(Enum):
Expand Down Expand Up @@ -76,23 +58,28 @@ def extract(
model: ModelType = ModelType.BASE,
extract_args: Optional[Dict] = None,
) -> Tuple[str, str]:
"""Extract data in real-time.
"""Extract full content from a file in real-time.
Args:
file_path (str): The path to the file to be parsed.
extract_args (Optional[Dict]): Additional extraction arguments added to prompt
model (ModelType): The model to use for extraction. Can be
`ModelType.BASE` or `ModelType.PRO`. Defaults to `ModelType.BASE`.
extract_args (Optional[Dict]): Additional extraction arguments added
to the prompt.
Returns:
tuple(str, str): The extracted data and the time taken.
"""

file_extension = Path(file_path).suffix.lower().lstrip(".")

# Check if the file exists and file_type
error = self._check_file_type_and_path(file_path, file_extension)
error = check_file_type_and_path(file_path, file_extension)

if error:
return error, None

error = self._check_model(model)
error = check_model(model)
if error:
return error, None

Expand Down Expand Up @@ -145,24 +132,24 @@ def extract(
else:
return f"Error: {response.status_code} {response.text}", None

def extract_json(
def extract_key_value(
self,
file_path: str,
extract_instruction: Dict,
) -> Tuple[str, str]:
"""Extract json in real-time.
"""Extract key-value pairs from a file in real-time.
Args:
file_path (str): The path to the file to be parsed.
extract_instruction (Dict): A dictionary containing the keys to be extracted,
with their values as the description of those keys.
extract_instruction (Dict): A dictionary containing the keys to be
extracted, with their values as the description of those keys.
Returns:
tuple(str, str): The extracted data and the time taken.
"""
file_extension = Path(file_path).suffix.lower().lstrip(".")

# Check if the file exists and file_type
error = self._check_file_type_and_path(file_path, file_extension)
error = check_file_type_and_path(file_path, file_extension)
if error:
return error, None

Expand Down Expand Up @@ -274,23 +261,25 @@ def async_extract(
model: ModelType = ModelType.BASE,
extract_args: Optional[Dict] = None,
) -> str:
"""Extract data asynchronously.
"""Extract full content from a file asynchronously.
Args:
file_path (str): The path to the file to be parsed.
model (ModelType): The model to use for extraction. Can be
`ModelType.BASE` or `ModelType.PRO`. Defaults to `ModelType.BASE`.
extract_args (Optional[Dict]): Additional extraction arguments added to prompt
Returns:
str: The file id of the uploaded file.
"""
file_extension = Path(file_path).suffix.lower().lstrip(".")

# Check if the file exists and file_type
error = self._check_file_type_and_path(file_path, file_extension)
error = check_file_type_and_path(file_path, file_extension)

if error:
return error, None

error = self._check_model(model)
error = check_model(model)
if error:
return error, None

Expand Down Expand Up @@ -321,26 +310,26 @@ def async_extract(
)

# If response successful, upload the file
return self._upload_file_to_presigned_url(file_path, response)
return upload_file_to_presigned_url(file_path, response)

def async_extract_json(
def async_extract_key_value(
self,
file_path: str,
extract_instruction: Dict,
) -> str:
"""Extract data asynchronously.
"""Extract key-value pairs from a file asynchronously.
Args:
file_path (str): The path to the file to be parsed.
extract_instruction (Dict): A dictionary containing the keys to be extracted,
with their values as the description of those keys.
extract_instruction (Dict): A dictionary containing the keys to be
extracted, with their values as the description of those keys.
Returns:
str: The file id of the uploaded file.
"""
file_extension = Path(file_path).suffix.lower().lstrip(".")

# Check if the file exists and file_type
error = self._check_file_type_and_path(file_path, file_extension)
error = check_file_type_and_path(file_path, file_extension)

if error:
return error, None
Expand All @@ -363,15 +352,14 @@ def async_extract_json(
)

# If response successful, upload the file
return self._upload_file_to_presigned_url(file_path, response)
return upload_file_to_presigned_url(file_path, response)

def async_fetch(
self,
file_id: str,
sync: bool = True,
sync_timeout: int = 60,
sync_interval: int = 5,
result_type: str = "markdown",
) -> str:
"""Fetches extraction results asynchronously.
Expand All @@ -380,13 +368,11 @@ def async_fetch(
sync (bool, optional): Whether to wait for the results synchronously.
sync_timeout (int, optional): Maximum time to wait for results in seconds. Defaults to 60.
sync_interval (int, optional): Time interval between polling attempts in seconds. Defaults to 5.
result_type (string, optional): The type of result to fetch. Defaults to `markdown`.
Returns:
str: The extracted results as a markdown string.
None: If the extraction is still in progress (when sync is False).
"""
self._check_result_type(result_type)

response = None
# Create the JSON payload
Expand Down Expand Up @@ -416,58 +402,13 @@ def async_fetch(
if response is None:
return "Error: timeout, no response received"
if response.status_code == 200:
if result_type == "json":
return response.json()["json"]
else:
markdown_list = response.json()["markdown"]
result = response.json()
if "json" in result:
return result["json"]
elif "markdown" in result:
markdown_list = result["markdown"]
return "\n".join(markdown_list)
return f"Error: Invalid response format\n {result}"
if response.status_code == 202:
return None
return f"Error: {response.status_code} {response.text}"

def _upload_file_to_presigned_url(
self, file_path: str, response: requests.Response
) -> str:
if response.status_code == 200:
try:
file_id = response.json().get("fileId")
presigned_url = response.json().get("presignedUrl")
with open(file_path, "rb") as file_to_upload:
files = {"file": (file_path, file_to_upload)}
upload_resp = requests.post(
presigned_url["url"],
data=presigned_url["fields"],
files=files,
timeout=TIMEOUT,
)
if upload_resp.status_code != 204:
return f"Error: {upload_resp.status_code} {upload_resp.text}"
return file_id
except json.JSONDecodeError:
return "Error: Invalid JSON response"
else:
return f"Error: {response.status_code} {response.text}"

def _check_model(self, model: ModelType) -> None:
if model not in {ModelType.BASE, ModelType.PRO}:
valid_models = ", ".join(["`" + model.value + "`" for model in ModelType])
return f"Invalid model type: {model}. Supported `model` types include {valid_models}."

def _check_file_type_and_path(self, file_path, file_extension):
# Check if the file exists
if not Path(file_path).is_file():
return f"Error: File does not exist: {file_path}"

if file_extension not in SUPPORTED_FILE_EXTENSIONS:
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."

def _check_result_type(self, result_type: str) -> None:
if result_type not in RESULT_TYPES:
valid_result_types = ", ".join(RESULT_TYPES)
return f"Invalid result type: {result_type}. Supported `result_type` types include {valid_result_types}."

def _check_resume_extract_type(self, extract_type: str) -> None:
if extract_type not in RESUME_EXTRACT_TYPES:
valid_extract_types = ", ".join(RESUME_EXTRACT_TYPES)
return f"Invalid extract type: {extract_type}. Supported `extract_type` types include {valid_extract_types}."
63 changes: 63 additions & 0 deletions any_parser/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import json
from enum import Enum
from pathlib import Path
from typing import Optional

import requests


class ModelType(Enum):
    """Models that can be requested for an extraction."""

    # The member values are the lowercase strings "base" and "pro".
    BASE = "base"
    PRO = "pro"


# Lowercase file extensions (without the leading dot) accepted for parsing.
SUPPORTED_FILE_EXTENSIONS = [
    # Documents and slide decks
    "pdf", "doc", "docx", "ppt", "pptx",
    # Images
    "jpg", "jpeg", "png", "gif",
]


def upload_file_to_presigned_url(
    file_path: str, response: "requests.Response", timeout: int = 10
) -> str:
    """Upload a local file to the presigned URL described by ``response``.

    Args:
        file_path: Path of the local file to upload.
        response: Response whose JSON body is expected to carry ``fileId``
            and ``presignedUrl`` (a mapping with ``url`` and ``fields``).
        timeout: Seconds to wait for the upload request. Defaults to 10.

    Returns:
        The ``fileId`` from ``response`` when the upload is answered with
        status 204; otherwise a string starting with ``"Error:"``.
    """
    if response.status_code != 200:
        return f"Error: {response.status_code} {response.text}"

    # Parse the body once and reuse it for both fields.
    try:
        payload = response.json()
    except json.JSONDecodeError:
        return "Error: Invalid JSON response"

    # Validate before touching the file so a malformed body yields an error
    # string instead of a TypeError/KeyError while building the upload.
    if not isinstance(payload, dict):
        payload = {}
    file_id = payload.get("fileId")
    presigned_url = payload.get("presignedUrl")
    if (
        file_id is None
        or not isinstance(presigned_url, dict)
        or "url" not in presigned_url
        or "fields" not in presigned_url
    ):
        return "Error: Invalid response format: missing fileId or presignedUrl"

    with open(file_path, "rb") as file_to_upload:
        upload_resp = requests.post(
            presigned_url["url"],
            data=presigned_url["fields"],
            files={"file": (file_path, file_to_upload)},
            timeout=timeout,
        )

    # Any upload status other than 204 is reported as an error.
    if upload_resp.status_code != 204:
        return f"Error: {upload_resp.status_code} {upload_resp.text}"
    return file_id


def check_model(model: ModelType) -> Optional[str]:
    """Validate that ``model`` is a member of ``ModelType``.

    Args:
        model: The value supplied by the caller as the extraction model.

    Returns:
        ``None`` when ``model`` is a ``ModelType`` member; otherwise a
        message naming the rejected value and listing the supported values.
    """
    # isinstance, rather than membership in a hand-written set, stays in sync
    # with the enum and cannot raise on unhashable input.
    if isinstance(model, ModelType):
        return None
    valid_models = ", ".join(f"`{member.value}`" for member in ModelType)
    return f"Invalid model type: {model}. Supported `model` types include {valid_models}."


def check_file_type_and_path(file_path, file_extension):
    """Check that ``file_path`` is an existing file of a supported type.

    Args:
        file_path: Path of the file to check.
        file_extension: Extension to look up in ``SUPPORTED_FILE_EXTENSIONS``.

    Returns:
        An ``"Error: ..."`` message when the file does not exist or the
        extension is not supported; ``None`` otherwise.
    """
    # Existence is checked first, so a missing file is reported even when
    # its extension is also unsupported.
    candidate = Path(file_path)
    if not candidate.is_file():
        return f"Error: File does not exist: {file_path}"

    if file_extension in SUPPORTED_FILE_EXTENSIONS:
        return None

    supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
    return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."
Loading

0 comments on commit ad0fc39

Please sign in to comment.