diff --git a/mint.json b/mint.json index 8f8e4e8..d0380e7 100644 --- a/mint.json +++ b/mint.json @@ -61,6 +61,8 @@ "pages": [ "sdk-reference/introduction", "sdk-reference/parse", + "sdk-reference/parse_with_ocr", + "sdk-reference/parse_with_layout", "sdk-reference/extract_key_value", "sdk-reference/extract_resume_key_value", "sdk-reference/extract_pii", diff --git a/quickstart.mdx b/quickstart.mdx index 1866307..0921b46 100644 --- a/quickstart.mdx +++ b/quickstart.mdx @@ -35,7 +35,7 @@ from any_parser import AnyParser ap = AnyParser(api_key="...") -md_output, total_time = ap.extract(file_path="./data/test.pdf") +md_output, total_time = ap.extract(file_path="/path/to/your/file") ``` ```python anyparser_async.py @@ -43,7 +43,7 @@ from any_parser import AnyParser ap = AnyParser(api_key="...") -file_id = ap.async_extract(file_path="./data/test.pdf") +file_id = ap.async_extract(file_path="/path/to/your/file") md = ap.async_fetch(file_id=file_id) ``` diff --git a/sdk-reference/extract_key_value.mdx b/sdk-reference/extract_key_value.mdx index abb351c..f1254c2 100644 --- a/sdk-reference/extract_key_value.mdx +++ b/sdk-reference/extract_key_value.mdx @@ -29,7 +29,7 @@ Finally, use the `extract_key_value` method, passing in the following: - `extract_instruction` (dict): keys to extract and their descriptions ```python anyparser_extract_key_value.py -key_value_result, total_time = ap.extract_key_value(local_file_path, extract_instruction) +key_value_result, total_time = ap.extract_key_value(file_path="/path/to/your/file", extract_instruction=extract_instruction) ``` This will return two things: @@ -48,9 +48,13 @@ extract_instruction = { "ein": "the employer identification number", } -key_value_result, total_time = ap.extract_key_value(local_file_path, extract_instruction) +key_value_result, total_time = ap.extract_key_value(file_path="/path/to/your/file", extract_instruction=extract_instruction) ``` +## Output + +A dictionary where the keys are the same as those in the input dictionary, and the values are the 
corresponding extracted results. + ## Full Notebook Examples Check out these notebooks for more detailed examples of using both sync and async AnyParser. diff --git a/sdk-reference/extract_pii.mdx b/sdk-reference/extract_pii.mdx index d4a3bcd..6aa2611 100644 --- a/sdk-reference/extract_pii.mdx +++ b/sdk-reference/extract_pii.mdx @@ -1,10 +1,10 @@ --- title: 'Extract PII' -description: 'Extract Personally Identifiable Information (PII) information from your documents' +description: 'Extract Personally Identifiable Information (PII) from your documents' --- ## Overview -Using `AnyParser`, you can extract PII information from your documents, including +Using `AnyParser`, you can extract PII from your documents, including - Name - Phone Number @@ -29,7 +29,7 @@ Then, use the `anyparser_pii` method, passing in the following: - `file_path` (str): the path to the local file ```python anyparser_extract_pii.py -pii_result, total_time = ap.extract_pii(local_file_path) +pii_result, total_time = ap.extract_pii(file_path="/path/to/your/file") ``` This will return two things: @@ -48,6 +48,10 @@ local_file_path = "/path/to/your/file" pii_result, total_time = ap.extract_pii(local_file_path) ``` +## Output + +A dictionary containing Personally Identifiable Information (PII). + ## Full Notebook Examples Check out these notebooks for more detailed examples of using both sync and async AnyParser. 
diff --git a/sdk-reference/extract_resume_key_value.mdx b/sdk-reference/extract_resume_key_value.mdx index 8257cf9..188ad5b 100644 --- a/sdk-reference/extract_resume_key_value.mdx +++ b/sdk-reference/extract_resume_key_value.mdx @@ -29,7 +29,7 @@ Then, use the `extract_resume_key_value` method, passing in the following: - `file_path` (str): the path to the local file ```python anyparser_extract_resume_key_value.py -resume_result, total_time = ap.extract_resume_key_value(local_file_path) +resume_result, total_time = ap.extract_resume_key_value(file_path="/path/to/your/file") ``` This will return two things: @@ -43,11 +43,13 @@ from any_parser import AnyParser ap = AnyParser(api_key="...") -local_file_path = "/path/to/your/file" - -key_value_result, total_time = ap.extract_resume_key_value(local_file_path) +key_value_result, total_time = ap.extract_resume_key_value(file_path="/path/to/your/file") ``` +## Output + +A dictionary containing resume information. + ## Full Notebook Examples Check out these notebooks for more detailed examples of using both sync and async AnyParser. diff --git a/sdk-reference/extract_tables.mdx b/sdk-reference/extract_tables.mdx index e4510ec..64f86fd 100644 --- a/sdk-reference/extract_tables.mdx +++ b/sdk-reference/extract_tables.mdx @@ -35,11 +35,13 @@ from any_parser import AnyParser ap = AnyParser(api_key="...") -local_file_path = "/path/to/your/file" - -pii_result, total_time = ap.extract_tables(local_file_path) +table_result, total_time = ap.extract_tables(file_path="/path/to/your/file") ``` +## Output + +A string containing all the tables in HTML format. + ## Full Notebook Examples Check out these notebooks for more detailed examples of using both sync and async AnyParser. 
diff --git a/sdk-reference/parse.mdx b/sdk-reference/parse.mdx index 76a67b2..871a103 100644 --- a/sdk-reference/parse.mdx +++ b/sdk-reference/parse.mdx @@ -16,7 +16,7 @@ from any_parser import AnyParser ap = AnyParser(api_key="...") -md_output, total_time = ap.parse(file_path="./data/test.pdf") +md_output, total_time = ap.parse(file_path="/path/to/your/file") ``` ```python anyparser_async.py @@ -24,42 +24,20 @@ from any_parser import AnyParser ap = AnyParser(api_key="...") -file_id = ap.async_parse(file_path="./data/test.pdf") +file_id = ap.async_parse(file_path="/path/to/your/file") md = ap.async_fetch(file_id=file_id) ``` -## Advanced Parsing +## Output -The `parse` API will invoke a general-purpose model that is optimized for a wide range of document types. -CambioML also provides more advanced models that is optimized for more complex documents. - - -### Async Parse with OCR - -The `Parse with OCR` model refines parsing results by applying OCR detection and correction techniques. -```python -# start the parsing request -file_id = ap.async_parse_with_ocr(example_local_file) -# fetch results (5s polling up to 60s) -markdown_string = ap.async_fetch(file_id, sync=True, sync_timeout=60, sync_interval=5) -``` - -### Async Parse with Layout Model -The `Parse with Layout` model first analyzes the layout information of the file, -then processes each element separately using specialized models tailored for different content types. -```python -# start the parsing request -file_id = ap.async_parse_with_layout(example_local_file) -# fetch results (5s polling up to 60s) -markdown_string = ap.async_fetch(file_id, sync=True, sync_timeout=60, sync_interval=5) -``` +A string containing the markdown representation of the given file. 
## Full Notebook Examples Check out these notebooks for more detailed examples of using AnyParser BASE and PRO models: -- [AnyParser Sync API](https://github.com/CambioML/any-parser/blob/main/examples/pdf_to_markdown.ipynb): Parse 1-2 pages short documents (which will time out after 30 seconds). -- [AnyParser Async API](https://github.com/CambioML/any-parser/blob/main/examples/async_pdf_to_markdown.ipynb): Parse longer documents (which may take longer than 30 seconds). +- [AnyParser Sync API](https://github.com/CambioML/any-parser/blob/main/examples/parse_pdf.ipynb): Parse 1-2 pages short documents (which will time out after 30 seconds). +- [AnyParser Async API](https://github.com/CambioML/any-parser/blob/main/examples/async_parse_pdf.ipynb): Parse longer documents (which may take longer than 30 seconds). + + Extracting content from a table of contents. + + \ No newline at end of file diff --git a/sdk-reference/parse_with_ocr.mdx b/sdk-reference/parse_with_ocr.mdx new file mode 100644 index 0000000..f75142b --- /dev/null +++ b/sdk-reference/parse_with_ocr.mdx @@ -0,0 +1,43 @@ +--- +title: 'Parse With OCR' +description: 'Parse the full content from your documents into markdown format.' +--- + +## Overview +Using `AnyParser`, you can parse the full content from your documents into markdown. +The `Parse with OCR` model refines parsing results by applying OCR detection and correction techniques. + +## Setup +Refer to the [Quickstart guide](/quickstart/#setup) to install the AnyParser SDK and get your API key. + +Next, set up your `AnyParser` sync or async client. + + +```python anyparser_async.py +from any_parser import AnyParser + +ap = AnyParser(api_key="...") +# start the parsing request +file_id = ap.async_parse_with_ocr(file_path="/path/to/your/file") +# fetch results (5s polling up to 60s) +markdown_string = ap.async_fetch(file_id, sync=True, sync_timeout=60, sync_interval=5) +``` + +## Output + +A string containing the markdown representation of the given file. 
+ +## Full Notebook Examples + +Check out these notebooks for more detailed examples of using AnyParser BASE and PRO models: +- [AnyParser Async API](https://github.com/CambioML/any-parser/blob/main/examples/async_parse_with_ocr.ipynb): Parse longer documents (which may take longer than 30 seconds). + + + + Extracting content from a table of contents. + + \ No newline at end of file