bug/<short-name>Unstructured Partition PDF , tesseract ERROR!!! #3789

suhaif314 · 2024-11-21T13:30:35Z

Describe the bug
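I am trying to use `partition_pdf` from the unstructured library, but it fails with a `TesseractNotFoundError`. I have tried several workarounds, such as adding the Tesseract install directory to the global PATH environment variable and setting the Tesseract path for the OCR agent.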

To Reproduce

raw_pdf_element = partition_pdf(
filename= r"C:\Users\Documents\Practice_myself\data\2206.01062.pdf",
strategy='hi_res',
extract_images_in_pdf=True,
extract_image_block_types=["Image", "table"],
extract_image_block_to_payload=False,
extract_image_block_output_dir='extracted_data'

)

Expected behavior
I expected `partition_pdf` to locate tesseract.exe and process the PDF, but it is unable to find it.

Screenshots

Environment Info
{
"name": "TesseractNotFoundError",
"message": "tesseract is not installed or it's not in your PATH. See README file for more information.",
"stack": "---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:451, in get_tesseract_version()
450 try:
--> 451 output = subprocess.check_output(
452 [tesseract_cmd, '--version'],
453 stderr=subprocess.STDOUT,
454 env=environ,
455 stdin=subprocess.DEVNULL,
456 )
457 except OSError:

File C:\Program Files\Python312\Lib\subprocess.py:466, in check_output(timeout, *popenargs, **kwargs)
464 kwargs['input'] = empty
--> 466 return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
467 **kwargs).stdout

File C:\Program Files\Python312\Lib\subprocess.py:548, in run(input, capture_output, timeout, check, *popenargs, **kwargs)
546 kwargs['stderr'] = PIPE
--> 548 with Popen(*popenargs, **kwargs) as process:
549 try:

File C:\Program Files\Python312\Lib\subprocess.py:1026, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group)
1023 self.stderr = io.TextIOWrapper(self.stderr,
1024 encoding=encoding, errors=errors)
-> 1026 self._execute_child(args, executable, preexec_fn, close_fds,
1027 pass_fds, cwd, env,
1028 startupinfo, creationflags, shell,
1029 p2cread, p2cwrite,
1030 c2pread, c2pwrite,
1031 errread, errwrite,
1032 restore_signals,
1033 gid, gids, uid, umask,
1034 start_new_session, process_group)
1035 except:
1036 # Cleanup if the child failed starting.

File C:\Program Files\Python312\Lib\subprocess.py:1538, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group)
1537 try:
-> 1538 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
1539 # no special security
1540 None, None,
1541 int(not close_fds),
1542 creationflags,
1543 env,
1544 cwd,
1545 startupinfo)
1546 finally:
1547 # Child is launched. Close the parent's copy of those pipe
1548 # handles that only the child should have open. You need
(...)
1551 # pipe will not close when the child process exits and the
1552 # ReadFile will hang.

FileNotFoundError: [WinError 2] The system cannot find the file specified

During handling of the above exception, another exception occurred:

TesseractNotFoundError Traceback (most recent call last)
Cell In[21], line 1
----> 1 raw_pdf_element = partition_pdf(
2 filename= r"C:\Users\Documents\Practice_myself\data\2206.01062.pdf",
3 strategy='hi_res',
4 extract_images_in_pdf=True,
5 extract_image_block_types=["Image", "table"],
6 extract_image_block_to_payload=False,
7 extract_image_block_output_dir='extracted_data'
8
9 )

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\documents\elements.py:578, in process_metadata.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
576 @functools.wraps(func)
577 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 578 elements = func(*args, **kwargs)
579 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
581 unique_element_ids: bool = call_args.get("unique_element_ids", False)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\file_utils\filetype.py:725, in add_filetype.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
723 @functools.wraps(func)
724 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 725 elements = func(*args, **kwargs)
727 for element in elements:
728 # NOTE(robinson) - Attached files have already run through this logic
729 # in their own partitioning function
730 if element.metadata.attached_to_filename is None:

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\file_utils\filetype.py:683, in add_metadata.<locals>.wrapper(*args, **kwargs)
681 @functools.wraps(func)
682 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 683 elements = func(*args, **kwargs)
684 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
686 if call_args.get("metadata_filename"):

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\chunking\dispatch.py:74, in add_chunking_strategy.<locals>.wrapper(*args, **kwargs)
71 """The decorated function is replaced with this one."""
73 # -- call the partitioning function to get the elements --
---> 74 elements = func(*args, **kwargs)
76 # -- look for a chunking-strategy argument --
77 call_args = get_call_args_applying_defaults(func, *args, **kwargs)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf.py:209, in partition_pdf(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_filename, metadata_last_modified, chunking_strategy, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, **kwargs)
205 exactly_one(filename=filename, file=file)
207 languages = check_language_args(languages or [], ocr_languages)
--> 209 return partition_pdf_or_image(
210 filename=filename,
211 file=file,
212 include_page_breaks=include_page_breaks,
213 strategy=strategy,
214 infer_table_structure=infer_table_structure,
215 languages=languages,
216 metadata_last_modified=metadata_last_modified,
217 hi_res_model_name=hi_res_model_name,
218 extract_images_in_pdf=extract_images_in_pdf,
219 extract_image_block_types=extract_image_block_types,
220 extract_image_block_output_dir=extract_image_block_output_dir,
221 extract_image_block_to_payload=extract_image_block_to_payload,
222 starting_page_number=starting_page_number,
223 extract_forms=extract_forms,
224 form_extraction_skip_tables=form_extraction_skip_tables,
225 **kwargs,
226 )

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf.py:305, in partition_pdf_or_image(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, languages, metadata_last_modified, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, **kwargs)
303 with warnings.catch_warnings():
304 warnings.simplefilter("ignore")
--> 305 elements = _partition_pdf_or_image_local(
306 filename=filename,
307 file=spooled_to_bytes_io_if_needed(file),
308 is_image=is_image,
309 infer_table_structure=infer_table_structure,
310 include_page_breaks=include_page_breaks,
311 languages=languages,
312 ocr_languages=ocr_languages,
313 metadata_last_modified=metadata_last_modified or last_modified,
314 hi_res_model_name=hi_res_model_name,
315 pdf_text_extractable=pdf_text_extractable,
316 extract_images_in_pdf=extract_images_in_pdf,
317 extract_image_block_types=extract_image_block_types,
318 extract_image_block_output_dir=extract_image_block_output_dir,
319 extract_image_block_to_payload=extract_image_block_to_payload,
320 starting_page_number=starting_page_number,
321 extract_forms=extract_forms,
322 form_extraction_skip_tables=form_extraction_skip_tables,
323 **kwargs,
324 )
325 out_elements = _process_uncategorized_text_elements(elements)
327 elif strategy == PartitionStrategy.FAST:

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\utils.py:216, in requires_dependencies.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
213 @wraps(func)
214 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
215 run_check()
--> 216 return func(*args, **kwargs)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf.py:626, in _partition_pdf_or_image_local(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_languages, ocr_mode, model_name, hi_res_model_name, pdf_image_dpi, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, analysis, analyzed_image_output_dir_path, starting_page_number, extract_forms, form_extraction_skip_tables, pdf_hi_res_max_pages, **kwargs)
619 # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
620 merged_document_layout = merge_inferred_with_extracted_layout(
621 inferred_document_layout=inferred_document_layout,
622 extracted_layout=extracted_layout,
623 hi_res_model_name=hi_res_model_name,
624 )
--> 626 final_document_layout = process_file_with_ocr(
627 filename,
628 merged_document_layout,
629 extracted_layout=extracted_layout,
630 is_image=is_image,
631 infer_table_structure=infer_table_structure,
632 ocr_languages=ocr_languages,
633 ocr_mode=ocr_mode,
634 pdf_image_dpi=pdf_image_dpi,
635 ocr_layout_dumper=ocr_layout_dumper,
636 )
637 else:
638 inferred_document_layout = process_data_with_model(
639 file,
640 is_image=is_image,
641 model_name=hi_res_model_name,
642 pdf_image_dpi=pdf_image_dpi,
643 )

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\utils.py:216, in requires_dependencies.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
213 @wraps(func)
214 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
215 run_check()
--> 216 return func(*args, **kwargs)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:178, in process_file_with_ocr(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper)
176 except Exception as e:
177 if os.path.isdir(filename) or os.path.isfile(filename):
--> 178 raise e
179 else:
180 raise FileNotFoundError(f'File "{filename}" not found!') from e

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:165, in process_file_with_ocr(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper)
163 extracted_regions = extracted_layout[i] if i < len(extracted_layout) else None
164 with PILImage.open(image_path) as image:
--> 165 merged_page_layout = supplement_page_layout_with_ocr(
166 page_layout=out_layout.pages[i],
167 image=image,
168 infer_table_structure=infer_table_structure,
169 ocr_languages=ocr_languages,
170 ocr_mode=ocr_mode,
171 extracted_regions=extracted_regions,
172 ocr_layout_dumper=ocr_layout_dumper,
173 )
174 merged_page_layouts.append(merged_page_layout)
175 return DocumentLayout.from_pages(merged_page_layouts)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\utils.py:216, in requires_dependencies.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
213 @wraps(func)
214 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
215 run_check()
--> 216 return func(*args, **kwargs)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\pdf_image\ocr.py:203, in supplement_page_layout_with_ocr(page_layout, image, infer_table_structure, ocr_languages, ocr_mode, extracted_regions, ocr_layout_dumper)
201 ocr_agent = OCRAgent.get_agent(language=ocr_languages)
202 if ocr_mode == OCRMode.FULL_PAGE.value:
--> 203 ocr_layout = ocr_agent.get_layout_from_image(image)
204 if ocr_layout_dumper:
205 ocr_layout_dumper.add_ocred_page(ocr_layout)

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured\partition\utils\ocr_models\tesseract_ocr.py:50, in OCRAgentTesseract.get_layout_from_image(self, image)
48 trace_logger.detail("Processing entire page OCR with tesseract...")
49 zoom = 1
---> 50 ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
51 np.array(image),
52 lang=self.language,
53 output_type=Output.DATAFRAME,
54 )
55 ocr_df = ocr_df.dropna()
57 # tesseract performance degrades when the text height is out of the preferred zone so we
58 # zoom the image (in or out depending on estimated text height) for optimum OCR results
59 # but this needs to be evaluated based on actual use case as the optimum scaling also
60 # depend on type of characters (font, language, etc); be careful about this
61 # functionality

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:590, in image_to_data(image, lang, config, nice, output_type, timeout, pandas_config)
576 def image_to_data(
577 image,
578 lang=None,
(...)
583 pandas_config=None,
584 ):
585 """
586 Returns string containing box boundaries, confidences,
587 and other information. Requires Tesseract 3.05+
588 """
--> 590 if get_tesseract_version(cached=True) < TESSERACT_MIN_VERSION:
591 raise TSVNotSupported()
593 config = f'-c tessedit_create_tsv=1 {config.strip()}'

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:163, in run_once.<locals>.wrapper(*args, **kwargs)
160 @wraps(func)
161 def wrapper(*args, **kwargs):
162 if not kwargs.pop('cached', False) or wrapper._result is wrapper:
--> 163 wrapper._result = func(*args, **kwargs)
164 return wrapper._result

File c:\Users\Documents\Practice_myself\Extraction\Unstructured\.venv\Lib\site-packages\unstructured_pytesseract\pytesseract.py:458, in get_tesseract_version()
451 output = subprocess.check_output(
452 [tesseract_cmd, '--version'],
453 stderr=subprocess.STDOUT,
454 env=environ,
455 stdin=subprocess.DEVNULL,
456 )
457 except OSError:
--> 458 raise TesseractNotFoundError()
460 raw_version = output.decode(DEFAULT_ENCODING)
461 str_version, *_ = raw_version.lstrip(string.printable[10:]).partition(' ')

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information."
}

Additional context
Add any other context about the problem here.

suhaif314 added the bug Something isn't working label Nov 21, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

bug/<short-name>Unstructured Partition PDF , tesseract ERROR!!! #3789

bug/<short-name>Unstructured Partition PDF , tesseract ERROR!!! #3789

suhaif314 commented Nov 21, 2024

bug/<short-name>Unstructured Partition PDF , tesseract ERROR!!! #3789

bug/<short-name>Unstructured Partition PDF , tesseract ERROR!!! #3789

Comments

suhaif314 commented Nov 21, 2024