diff --git a/llama_parse/base.py b/llama_parse/base.py
index e53a7db..a3e5320 100644
--- a/llama_parse/base.py
+++ b/llama_parse/base.py
@@ -20,20 +20,7 @@
 )
 from copy import deepcopy
 
-
-def _get_sub_docs(docs: List[Document]) -> List[Document]:
-    """Split docs into pages, by separator."""
-    sub_docs = []
-    for doc in docs:
-        doc_chunks = doc.text.split("\n---\n")
-        for doc_chunk in doc_chunks:
-            sub_doc = Document(
-                text=doc_chunk,
-                metadata=deepcopy(doc.metadata),
-            )
-            sub_docs.append(sub_doc)
-
-    return sub_docs
+_DEFAULT_SEPARATOR = "\n---\n"  # fallback used by _get_sub_docs when page_separator is falsy
 
 
 class LlamaParse(BasePydanticReader):
@@ -119,7 +106,7 @@ class LlamaParse(BasePydanticReader):
     )
     split_by_page: bool = Field(
         default=True,
-        description="Whether to split by page (NOTE: using a predefined separator `\n---\n`)",
+        description="Whether to split by page using the page separator",
     )
 
     @validator("api_key", pre=True, always=True)
@@ -259,7 +246,7 @@ async def _aload_data(
                 )
             ]
             if self.split_by_page:
-                return _get_sub_docs(docs)
+                return self._get_sub_docs(docs)
             else:
                 return docs
 
@@ -423,3 +410,18 @@ def get_images(self, json_result: List[dict], download_path: str) -> List[dict]:
                 return []
             else:
                 raise e
+
+    def _get_sub_docs(self, docs: List[Document]) -> List[Document]:
+        """Split each doc's text on the page separator into one Document per chunk."""
+        sub_docs = []
+        separator = self.page_separator or _DEFAULT_SEPARATOR  # None or "" -> default
+        for doc in docs:
+            doc_chunks = doc.text.split(separator)
+            for doc_chunk in doc_chunks:
+                sub_doc = Document(
+                    text=doc_chunk,
+                    metadata=deepcopy(doc.metadata),  # each chunk gets its own metadata copy
+                )
+                sub_docs.append(sub_doc)
+
+        return sub_docs