update SDK reference

CambioML · Nov 25, 2024 · 59d3a0d · 59d3a0d
1 parent d06bf84
commit 59d3a0d
Show file tree

Hide file tree

Showing 9 changed files with 120 additions and 42 deletions.
diff --git a/mint.json b/mint.json
@@ -61,6 +61,8 @@
       "pages": [
         "sdk-reference/introduction",
         "sdk-reference/parse",
+        "sdk-reference/parse_with_ocr",
+        "sdk-reference/parse_with_layout",
         "sdk-reference/extract_key_value",
         "sdk-reference/extract_resume_key_value",
         "sdk-reference/extract_pii",

diff --git a/quickstart.mdx b/quickstart.mdx
@@ -35,15 +35,15 @@ from any_parser import AnyParser
 
 ap = AnyParser(api_key="...")
 
-md_output, total_time = ap.extract(file_path="./data/test.pdf")
+md_output, total_time = ap.extract(file_path="/path/to/your/file")
 ```
 
 ```python anyparser_async.py
 from any_parser import AnyParser
 
 ap = AnyParser(api_key="...")
 
-file_id = ap.async_extract(file_path="./data/test.pdf")
+file_id = ap.async_extract(file_path="/path/to/your/file")
 
 md = ap.async_fetch(file_id=file_id)
 ```

diff --git a/sdk-reference/extract_key_value.mdx b/sdk-reference/extract_key_value.mdx
@@ -29,7 +29,7 @@ Finally, use the `extract_key_value` method, passing in the following:
 - `extract_instruction` (dict): keys to extract and their descriptions
 
 ```python anyparser_extract_key_value.py
-key_value_result, total_time = ap.extract_key_value(local_file_path, extract_instruction)
+key_value_result, total_time = ap.extract_key_value(file_path="/path/to/your/file", extract_instruction=extract_instruction)
 ```
 
 This will return two things:
@@ -48,9 +48,13 @@ extract_instruction = {
                 "ein": "the employer identification number",
             }
 
-key_value_result, total_time = ap.extract_key_value(local_file_path, extract_instruction)
+key_value_result, total_time = ap.extract_key_value(file_path="/path/to/your/file", extract_instruction=extract_instruction)
 ```
 
+## Output
+
+A dictionary where the keys are the same as those in the input dictionary, and the values are the corresponding extracted results.
+
 ## Full Notebook Examples
 
 Check out these notebooks for more detailed examples of using both sync and async AnyParser.

diff --git a/sdk-reference/extract_pii.mdx b/sdk-reference/extract_pii.mdx
@@ -1,10 +1,10 @@
 ---
 title: 'Extract PII'
-description: 'Extract Personally Identifiable Information (PII) information from your documents'
+description: 'Extract Personally Identifiable Information (PII) from your documents'
 ---
 
 ## Overview
-Using `AnyParser`, you can extract PII information from your documents, including
+Using `AnyParser`, you can extract PII from your documents, including
 
 - Name
 - Phone Number
@@ -29,7 +29,7 @@ Then, use the `anyparser_pii` method, passing in the following:
 - `file_path` (str): the path to the local file
 
 ```python anyparser_extract_pii.py
-pii_result, total_time = ap.extract_pii(local_file_path)
+pii_result, total_time = ap.extract_pii(file_path="/path/to/your/file")
 ```
 
 This will return two things:
@@ -48,6 +48,10 @@ local_file_path = "/path/to/your/file"
 pii_result, total_time = ap.extract_pii(local_file_path)
 ```
 
+## Output
+
+A dictionary containing Personally Identifiable Information (PII).
+
 ## Full Notebook Examples
 
 Check out these notebooks for more detailed examples of using both sync and async AnyParser.

diff --git a/sdk-reference/extract_resume_key_value.mdx b/sdk-reference/extract_resume_key_value.mdx
@@ -29,7 +29,7 @@ Then, use the `extract_resume_key_value` method, passing in the following:
 - `file_path` (str): the path to the local file
 
 ```python anyparser_extract_resume_key_value.py
-resume_result, total_time = ap.extract_resume_key_value(local_file_path)
+resume_result, total_time = ap.extract_resume_key_value(file_path="/path/to/your/file")
 ```
 
 This will return two things:
@@ -43,11 +43,13 @@ from any_parser import AnyParser
 
 ap = AnyParser(api_key="...")
 
-local_file_path = "/path/to/your/file"
-
-key_value_result, total_time = ap.extract_resume_key_value(local_file_path)
+key_value_result, total_time = ap.extract_resume_key_value(file_path="/path/to/your/file")
 ```
 
+## Output
+
+A dictionary containing resume information.
+
 ## Full Notebook Examples
 
 Check out these notebooks for more detailed examples of using both sync and async AnyParser.

diff --git a/sdk-reference/extract_tables.mdx b/sdk-reference/extract_tables.mdx
@@ -35,11 +35,13 @@ from any_parser import AnyParser
 
 ap = AnyParser(api_key="...")
 
-local_file_path = "/path/to/your/file"
-
-pii_result, total_time = ap.extract_tables(local_file_path)
+table_result, total_time = ap.extract_tables(file_path="/path/to/your/file")
 ```
 
+## Output
+
+A string containing all the tables in HTML format. 
+
 ## Full Notebook Examples
 
 Check out these notebooks for more detailed examples of using both sync and async AnyParser.

diff --git a/sdk-reference/parse.mdx b/sdk-reference/parse.mdx
@@ -16,50 +16,28 @@ from any_parser import AnyParser
 
 ap = AnyParser(api_key="...")
 
-md_output, total_time = ap.parse(file_path="./data/test.pdf")
+md_output, total_time = ap.parse(file_path="/path/to/your/file")
 ```
 
 ```python anyparser_async.py
 from any_parser import AnyParser
 
 ap = AnyParser(api_key="...")
 
-file_id = ap.async_parse(file_path="./data/test.pdf")
+file_id = ap.async_parse(file_path="/path/to/your/file")
 
 md = ap.async_fetch(file_id=file_id)
 ```
 
-## Advanced Parsing
+## Output
 
-The `parse` API will invoke a general-purpose model that is optimized for a wide range of document types. 
-CambioML also provides more advanced models that is optimized for more complex documents.
-
-
-### Async Parse with OCR
-
-The `Parse with OCR` model refines parsing results by applying OCR detection and correction techniques.
-```python
-# start the parsing request
-file_id = ap.async_parse_with_ocr(example_local_file)
-# fetch results (5s polling up to 60s)
-markdown_string = ap.async_fetch(file_id, sync=True, sync_timeout=60, sync_interval=5)
-```
-
-### Async Parse with Layout Model
-The `Parse with Layout` model first analyzes the layout information of the file,
-then processes each element separately using specialized models tailored for different content types.
-```python
-# start the parsing request
-file_id = ap.async_parse_with_layout(example_local_file)
-# fetch results (5s polling up to 60s)
-markdown_string = ap.async_fetch(file_id, sync=True, sync_timeout=60, sync_interval=5)
-```
+A string containing the markdown representation of the given file.
 
 ## Full Notebook Examples
 
 Check out these notebooks for more detailed examples of using AnyParser BASE and PRO models:
-- [AnyParser Sync API](https://github.com/CambioML/any-parser/blob/main/examples/pdf_to_markdown.ipynb): Parse 1-2 pages short documents (which will time out after 30 seconds).
-- [AnyParser Async API](https://github.com/CambioML/any-parser/blob/main/examples/async_pdf_to_markdown.ipynb): Parse longer documents (which may take longer than 30 seconds).
+- [AnyParser Sync API](https://github.com/CambioML/any-parser/blob/main/examples/parse_pdf.ipynb): Parse short documents of 1-2 pages (requests time out after 30 seconds).
+- [AnyParser Async API](https://github.com/CambioML/any-parser/blob/main/examples/async_parse_pdf.ipynb): Parse longer documents (which may take longer than 30 seconds).
 
 <CardGroup cols={2}>
   <Card

diff --git a/sdk-reference/parse_with_layout.mdx b/sdk-reference/parse_with_layout.mdx
@@ -0,0 +1,43 @@
+---
+title: 'Parse With Layout'
+description: 'Parse the full content from your documents into markdown format.'
+---
+
+## Overview
+Using `AnyParser`, you can parse the full content from your documents into markdown. 
+The `Parse with Layout` model first analyzes the layout information of the file,
+then processes each element separately using specialized models tailored for different content types.
+
+## Setup
+Refer to the [Quickstart guide](/quickstart/#setup) to install the AnyParser SDK and get your API key.
+
+Next, set up your `AnyParser` sync or async client.
+
+```python anyparser_async.py
+from any_parser import AnyParser
+
+ap = AnyParser(api_key="...")
+# start the parsing request
+file_id = ap.async_parse_with_layout(file_path="/path/to/your/file")
+# fetch results (5s polling up to 60s)
+markdown_string = ap.async_fetch(file_id, sync=True, sync_timeout=60, sync_interval=5)
+```
+
+## Output
+
+A string containing the markdown representation of the given file.
+
+## Full Notebook Examples
+
+Check out this notebook for a more detailed example of using AnyParser BASE and PRO models:
+- [AnyParser Async API](https://github.com/CambioML/any-parser/blob/main/examples/async_parse_with_layout.ipynb): Parse longer documents (which may take longer than 30 seconds).
+
+<CardGroup cols={1}>
+  <Card
+    title="AnyParser Async Parse Example"
+    icon="rotate"
+    href="https://github.com/CambioML/any-parser/blob/main/examples/async_parse_with_layout.ipynb"
+  >
+    Extracting content from a table of contents.
+  </Card>
+</CardGroup>
diff --git a/sdk-reference/parse_with_ocr.mdx b/sdk-reference/parse_with_ocr.mdx
@@ -0,0 +1,43 @@
+---
+title: 'Parse With OCR'
+description: 'Parse the full content from your documents into markdown format.'
+---
+
+## Overview
+Using `AnyParser`, you can parse the full content from your documents into markdown.
+The `Parse with OCR` model refines parsing results by applying OCR detection and correction techniques.
+
+## Setup
+Refer to the [Quickstart guide](/quickstart/#setup) to install the AnyParser SDK and get your API key.
+
+Next, set up your `AnyParser` sync or async client.
+
+
+```python anyparser_async.py
+from any_parser import AnyParser
+
+ap = AnyParser(api_key="...")
+# start the parsing request
+file_id = ap.async_parse_with_ocr(file_path="/path/to/your/file")
+# fetch results (5s polling up to 60s)
+markdown_string = ap.async_fetch(file_id, sync=True, sync_timeout=60, sync_interval=5)
+```
+
+## Output
+
+A string containing the markdown representation of the given file.
+
+## Full Notebook Examples
+
+Check out this notebook for a more detailed example of using AnyParser BASE and PRO models:
+- [AnyParser Async API](https://github.com/CambioML/any-parser/blob/main/examples/async_parse_with_ocr.ipynb): Parse longer documents (which may take longer than 30 seconds).
+
+<CardGroup cols={1}>
+  <Card
+    title="AnyParser Async Parse Example"
+    icon="rotate"
+    href="https://github.com/CambioML/any-parser/blob/main/examples/async_parse_with_ocr.ipynb"
+  >
+    Extracting content from a table of contents.
+  </Card>
+</CardGroup>