You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
sample_rag_langchain.ipynb : "message": "The file is corrupted or format is unsupported. Refer to documentation for the list of supported formats."
#45
Open
aantonellims opened this issue
Jul 17, 2024
· 0 comments
I'm facing an issue:
I first used the sample_figure_understanding.ipynb notebook, which generated markdown files.
Then I used the sample_rag_langchain.ipynb notebook to split and index my md file into Azure Search, but I am facing the following issue:
HttpResponseError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_25944\1031862820.py in ?()
22
23 # Initiate Azure AI Document Intelligence to load the document
24 loader = AzureAIDocumentIntelligenceLoader(file_path=file_path, api_key = doc_intelligence_key, api_endpoint = doc_intelligence_endpoint, api_model="prebuilt-layout")
25
---> 26 docs = loader.load()
27
28 # Assuming each file contains a single document for simplicity
29 docs_string = docs[0].page_content
c:\Python311\Lib\site-packages\langchain_core\document_loaders\base.py in ?(self)
28 def load(self) -> List[Document]:
29 """Load data into Document objects."""
---> 30 return list(self.lazy_load())
c:\Python311\Lib\site-packages\langchain_community\document_loaders\doc_intelligence.py in ?(self)
92 ) -> Iterator[Document]:
93 """Lazy load given path as pages."""
94 if self.file_path is not None:
95 blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
---> 96 yield from self.parser.parse(blob)
97 else:
98 yield from self.parser.parse_url(self.url_path) # type: ignore[arg-type]
c:\Python311\Lib\site-packages\langchain_core\document_loaders\base.py in ?(self, blob)
122
123 Returns:
124 List of documents
125 """
--> 126 return list(self.lazy_parse(blob))
c:\Python311\Lib\site-packages\langchain_community\document_loaders\parsers\doc_intelligence.py in ?(self, blob)
76 def lazy_parse(self, blob: Blob) -> Iterator[Document]:
77 """Lazily parse the blob."""
78
79 with blob.as_bytes_io() as file_obj:
---> 80 poller = self.client.begin_analyze_document(
81 self.api_model,
82 file_obj,
83 content_type="application/octet-stream",
c:\Python311\Lib\site-packages\azure\core\tracing\decorator.py in ?(*args, **kwargs)
74 passed_in_parent = kwargs.pop("parent_span", None)
75
76 span_impl_type = settings.tracing_implementation()
77 if span_impl_type is None:
---> 78 return func(*args, **kwargs)
79
80 # Merge span is parameter is set, but only if no explicit parent are passed
81 if merge_span and not passed_in_parent:
c:\Python311\Lib\site-packages\azure\ai\documentintelligence_operations_operations.py in ?(self, model_id, analyze_request, pages, locale, string_index_type, features, query_fields, output_content_format, **kwargs)
514 if _stream:
515 response.read() # Load the body in memory and close the socket
516 map_error(status_code=response.status_code, response=response, error_map=error_map)
517 error = _deserialize(_models.ErrorResponse, response.json())
--> 518 raise HttpResponseError(response=response, model=error)
519
520 response_headers = {}
521 response_headers["Retry-After"] = self._deserialize("int", response.headers.get("Retry-After"))
HttpResponseError: (InvalidRequest) Invalid request.
Code: InvalidRequest
Message: Invalid request.
Inner error: {
"code": "InvalidContent",
"message": "The file is corrupted or format is unsupported. Refer to documentation for the list of supported formats."
}
When I look at the generated markdown file, I can see that titles (#) are represented by "===".
I tried to make the change manually, but I am still facing the same issue. Can anybody help?
The text was updated successfully, but these errors were encountered:
Hello,
I'm facing an issue:
I first used the sample_figure_understanding.ipynb notebook, which generated markdown files.
Then I used the sample_rag_langchain.ipynb notebook to split and index my md file into Azure Search, but I am facing the following issue:
HttpResponseError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_25944\1031862820.py in ?()
22
23 # Initiate Azure AI Document Intelligence to load the document
24 loader = AzureAIDocumentIntelligenceLoader(file_path=file_path, api_key = doc_intelligence_key, api_endpoint = doc_intelligence_endpoint, api_model="prebuilt-layout")
25
---> 26 docs = loader.load()
27
28 # Assuming each file contains a single document for simplicity
29 docs_string = docs[0].page_content
c:\Python311\Lib\site-packages\langchain_core\document_loaders\base.py in ?(self)
28 def load(self) -> List[Document]:
29 """Load data into Document objects."""
---> 30 return list(self.lazy_load())
c:\Python311\Lib\site-packages\langchain_community\document_loaders\doc_intelligence.py in ?(self)
92 ) -> Iterator[Document]:
93 """Lazy load given path as pages."""
94 if self.file_path is not None:
95 blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
---> 96 yield from self.parser.parse(blob)
97 else:
98 yield from self.parser.parse_url(self.url_path) # type: ignore[arg-type]
c:\Python311\Lib\site-packages\langchain_core\document_loaders\base.py in ?(self, blob)
122
123 Returns:
124 List of documents
125 """
--> 126 return list(self.lazy_parse(blob))
c:\Python311\Lib\site-packages\langchain_community\document_loaders\parsers\doc_intelligence.py in ?(self, blob)
76 def lazy_parse(self, blob: Blob) -> Iterator[Document]:
77 """Lazily parse the blob."""
78
79 with blob.as_bytes_io() as file_obj:
---> 80 poller = self.client.begin_analyze_document(
81 self.api_model,
82 file_obj,
83 content_type="application/octet-stream",
c:\Python311\Lib\site-packages\azure\core\tracing\decorator.py in ?(*args, **kwargs)
74 passed_in_parent = kwargs.pop("parent_span", None)
75
76 span_impl_type = settings.tracing_implementation()
77 if span_impl_type is None:
---> 78 return func(*args, **kwargs)
79
80 # Merge span is parameter is set, but only if no explicit parent are passed
81 if merge_span and not passed_in_parent:
c:\Python311\Lib\site-packages\azure\ai\documentintelligence_operations_operations.py in ?(self, model_id, analyze_request, pages, locale, string_index_type, features, query_fields, output_content_format, **kwargs)
3623 polling: Union[bool, PollingMethod] = kwargs.pop("polling", True)
3624 lro_delay = kwargs.pop("polling_interval", self._config.polling_interval)
3625 cont_token: Optional[str] = kwargs.pop("continuation_token", None)
3626 if cont_token is None:
-> 3627 raw_result = self._analyze_document_initial( # type: ignore
3628 model_id=model_id,
3629 analyze_request=analyze_request,
3630 pages=pages,
c:\Python311\Lib\site-packages\azure\ai\documentintelligence_operations_operations.py in ?(self, model_id, analyze_request, pages, locale, string_index_type, features, query_fields, output_content_format, **kwargs)
514 if _stream:
515 response.read() # Load the body in memory and close the socket
516 map_error(status_code=response.status_code, response=response, error_map=error_map)
517 error = _deserialize(_models.ErrorResponse, response.json())
--> 518 raise HttpResponseError(response=response, model=error)
519
520 response_headers = {}
521 response_headers["Retry-After"] = self._deserialize("int", response.headers.get("Retry-After"))
HttpResponseError: (InvalidRequest) Invalid request.
Code: InvalidRequest
Message: Invalid request.
Inner error: {
"code": "InvalidContent",
"message": "The file is corrupted or format is unsupported. Refer to documentation for the list of supported formats."
}
When I look at the generated markdown file, I can see that titles (#) are represented by "===".
I tried to make the change manually, but I am still facing the same issue. Can anybody help?
The text was updated successfully, but these errors were encountered: