Skip to content

Commit

Permalink
IMPROVEMENT: Reduce post-processing time for DocAIParser (langchain-ai#13210)
Browse files: browse the repository at this point in the history

- Remove `WrappedDocument` introduced in
langchain-ai#11413
- Related: googleapis/python-documentai-toolbox#198 in
Document AI Toolbox, to improve initialization time for the `WrappedDocument`
object.

@lkuligin

@baskaryan

@hwchase17
  • Loading branch information
holtskinner authored Nov 20, 2023
1 parent f3fcdea commit 1c08dbf
Showing 1 changed file with 13 additions and 21 deletions.
34 changes: 13 additions & 21 deletions libs/langchain/langchain/document_loaders/parsers/docai.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ def __init__(
"a client."
)

pattern = "projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+"
pattern = r"projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+"
if processor_name and not re.fullmatch(pattern, processor_name):
raise ValueError(
f"Processor name {processor_name} has a wrong format. If your "
f"Processor name {processor_name} has the wrong format. If your "
"prediction endpoint looks like https://us-documentai.googleapis.com"
"/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process,"
" use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID "
Expand Down Expand Up @@ -139,9 +139,7 @@ def online_process(
" `pip install google-cloud-documentai`"
) from exc
try:
from google.cloud.documentai_toolbox.wrappers.document import (
Document as WrappedDocument,
)
from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
except ImportError as exc:
raise ImportError(
"documentai_toolbox package not found, please install it with"
Expand Down Expand Up @@ -171,16 +169,15 @@ def online_process(
field_mask=field_mask,
)
)
wrapped_document = WrappedDocument.from_documentai_document(response.document)
yield from (
Document(
page_content=page.text,
page_content=_text_from_layout(page.layout, response.document.text),
metadata={
"page": page.page_number,
"source": wrapped_document.gcs_input_uri,
"source": blob.path,
},
)
for page in wrapped_document.pages
for page in response.document.pages
)

def batch_parse(
Expand Down Expand Up @@ -239,28 +236,23 @@ def parse_from_results(
from google.cloud.documentai_toolbox.utilities.gcs_utilities import (
split_gcs_uri,
)
from google.cloud.documentai_toolbox.wrappers.document import (
Document as WrappedDocument,
)
from google.cloud.documentai_toolbox.wrappers.document import _get_shards
from google.cloud.documentai_toolbox.wrappers.page import _text_from_layout
except ImportError as exc:
raise ImportError(
"documentai_toolbox package not found, please install it with"
" `pip install google-cloud-documentai-toolbox`"
) from exc
for result in results:
gcs_bucket_name, gcs_prefix = split_gcs_uri(result.parsed_path)
wrapped_document = WrappedDocument.from_gcs(
gcs_bucket_name, gcs_prefix, gcs_input_uri=result.source_path
)
shards = _get_shards(gcs_bucket_name, gcs_prefix)
yield from (
Document(
page_content=page.text,
metadata={
"page": page.page_number,
"source": wrapped_document.gcs_input_uri,
},
page_content=_text_from_layout(page.layout, shard.text),
metadata={"page": page.page_number, "source": result.source_path},
)
for page in wrapped_document.pages
for shard in shards
for page in shard.pages
)

def operations_from_names(self, operation_names: List[str]) -> List["Operation"]:
Expand Down

0 comments on commit 1c08dbf

Please sign in to comment.