Skip to content

Commit

Permalink
Move _get_sub_docs to private function
Browse files Browse the repository at this point in the history
  • Loading branch information
adreichert committed Jun 27, 2024
1 parent 1bbf5f4 commit b644065
Showing 1 changed file with 18 additions and 16 deletions.
34 changes: 18 additions & 16 deletions llama_parse/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,7 @@
)
from copy import deepcopy


def _get_sub_docs(docs: List[Document]) -> List[Document]:
    """Break every document into per-page documents.

    Each input document's text is cut at the fixed page separator, and
    every resulting piece becomes a new ``Document`` that carries its own
    deep copy of the parent document's metadata. Output order follows the
    input order, page by page.

    Args:
        docs: Documents whose ``text`` may contain several pages.

    Returns:
        A flat list with one ``Document`` per page.
    """
    page_separator = "\n---\n"
    return [
        # deepcopy so that pages never share one mutable metadata object.
        Document(text=page_text, metadata=deepcopy(parent.metadata))
        for parent in docs
        for page_text in parent.text.split(page_separator)
    ]
# Fallback page separator: used to split document text into pages when the
# reader's own `page_separator` is unset or empty.
_DEFAULT_SEPARATOR = "\n---\n"


class LlamaParse(BasePydanticReader):
Expand Down Expand Up @@ -119,7 +106,7 @@ class LlamaParse(BasePydanticReader):
)
split_by_page: bool = Field(
default=True,
description="Whether to split by page (NOTE: using a predefined separator `\n---\n`)",
description="Whether to split by page using the page separator",
)

@validator("api_key", pre=True, always=True)
Expand Down Expand Up @@ -259,7 +246,7 @@ async def _aload_data(
)
]
if self.split_by_page:
return _get_sub_docs(docs)
return self._get_sub_docs(docs)
else:
return docs

Expand Down Expand Up @@ -423,3 +410,18 @@ def get_images(self, json_result: List[dict], download_path: str) -> List[dict]:
return []
else:
raise e

def _get_sub_docs(self, docs: List[Document]) -> List[Document]:
    """Break every document into per-page documents.

    The text of each input document is cut at this reader's
    ``page_separator``; when that value is unset or empty, the module-level
    ``_DEFAULT_SEPARATOR`` is used instead. Every piece becomes a new
    ``Document`` holding a deep copy of the parent document's metadata.

    Args:
        docs: Documents whose ``text`` may contain several pages.

    Returns:
        A flat list with one ``Document`` per page, in input order.
    """
    delimiter = self.page_separator
    if not delimiter:
        # A falsy separator (None or "") falls back to the default.
        delimiter = _DEFAULT_SEPARATOR

    pages: List[Document] = []
    for parent in docs:
        pages.extend(
            # deepcopy so that pages never share one mutable metadata object.
            Document(text=piece, metadata=deepcopy(parent.metadata))
            for piece in parent.text.split(delimiter)
        )
    return pages

0 comments on commit b644065

Please sign in to comment.