From df156ebe5ac4427ec7e2541e99cabb032801721d Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Thu, 31 Oct 2024 09:52:27 -0700 Subject: [PATCH] feat: support pdf link extraction in hi_res strategy (#3753) This PR adds support for link extraction in the pdf `hi_res` strategy. The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents. ### Summary - Added functionality to support link extraction in the hi_res flow - Enhanced the word extraction functionality used for link extraction in both the `fast` and `hi_res` flows, resulting in more accurate `start_index` and `text` values in the `links` metadata. - Updated the ingest fixture update workflow so it no longer skips the Astra DB source test ### Testing ``` from unstructured.partition.pdf import partition_pdf elements = partition_pdf( filename="example-docs/pdf/embedded-link.pdf", strategy="hi_res" ) assert len(elements[0].metadata.links) == 3 ``` --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: christinestraub Co-authored-by: cragwolfe --- .../ingest-test-fixtures-update-pr.yml | 2 + CHANGELOG.md | 4 +- requirements/ingest/ingest.txt | 2 +- .../partition/common/test_common.py | 78 +-- .../partition/pdf_image/test_pdf.py | 141 ++++- ...75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.json | 20 + ...97eea-73d7-4fca-a97e-ea73d7cfca62.csv.json | 20 + ...d99e3-9941-4c18-9d99-e399414c183d.csv.json | 20 + ...c0093-2277-4f3e-ac00-932277af3e0e.csv.json | 20 + ...0df94-0b3a-4f89-80df-940b3a6f8966.csv.json | 20 + .../25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.json | 98 --- .../60297eea-73d7-4fca-a97e-ea73d7cfca62.json | 26 - .../641d99e3-9941-4c18-9d99-e399414c183d.json | 98 --- .../762c0093-2277-4f3e-ac00-932277af3e0e.json | 98 --- .../ae40df94-0b3a-4f89-80df-940b3a6f8966.json | 146 ----- .../biomed-api/65/11/main.PMC6312790.pdf.json | 137 +++++ .../biomed-api/75/29/main.PMC6312793.pdf.json | 168 +++++ .../layout-parser-paper.pdf.json | 578 ++++++++++++++++++ .../biomed-api/65/11/main.PMC6312790.pdf.json | 34 +- .../biomed-api/75/29/main.PMC6312793.pdf.json | 52 +- .../s3/page-with-formula.pdf.json | 8 +- .../s3/page-with-formula.pdf.json | 38 ++ unstructured/__version__.py | 2 +- unstructured/partition/common/common.py | 117 +--- unstructured/partition/pdf.py | 426 ++++--------- .../pdf_image/pdfminer_processing.py | 404 +++++++++++- 26 files changed, 1718 insertions(+), 1039 deletions(-) create mode 100644 test_unstructured_ingest/expected-structured-output/astradb/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.json create mode 100644 test_unstructured_ingest/expected-structured-output/astradb/60297eea-73d7-4fca-a97e-ea73d7cfca62.csv.json create mode 100644 test_unstructured_ingest/expected-structured-output/astradb/641d99e3-9941-4c18-9d99-e399414c183d.csv.json create mode 100644 test_unstructured_ingest/expected-structured-output/astradb/762c0093-2277-4f3e-ac00-932277af3e0e.csv.json create mode 100644 test_unstructured_ingest/expected-structured-output/astradb/ae40df94-0b3a-4f89-80df-940b3a6f8966.csv.json delete mode 100644 test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.json delete mode 100644 test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/60297eea-73d7-4fca-a97e-ea73d7cfca62.json delete mode 100644 test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/641d99e3-9941-4c18-9d99-e399414c183d.json delete mode 100644
test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/762c0093-2277-4f3e-ac00-932277af3e0e.json delete mode 100644 test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/ae40df94-0b3a-4f89-80df-940b3a6f8966.json diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index f724e8dfc0..632f4ac3e1 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -94,6 +94,8 @@ jobs: AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }} + ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}} + ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}} OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" OVERWRITE_FIXTURES: "true" CI: "true" diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ae389092f..683413210d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.4-dev2 +## 0.16.4 ### Enhancements @@ -9,6 +9,8 @@ ### Features +* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively. + ### Fixes diff --git a/requirements/ingest/ingest.txt b/requirements/ingest/ingest.txt index 3fe2a4950b..6c99d3cfcd 100644 --- a/requirements/ingest/ingest.txt +++ b/requirements/ingest/ingest.txt @@ -1,4 +1,4 @@ -unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.0 +unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.1 s3fs>=2024.9.0 urllib3>=1.26.20 backoff>=2.2.1 diff --git a/test_unstructured/partition/common/test_common.py b/test_unstructured/partition/common/test_common.py index 441f0c51f9..277d7ace52 100644 --- a/test_unstructured/partition/common/test_common.py +++ b/test_unstructured/partition/common/test_common.py @@ -1,14 +1,11 @@ import pathlib -from dataclasses import dataclass from multiprocessing import Pool -from unittest import mock import numpy as np import pytest from PIL import Image from unstructured_inference.inference import layout from unstructured_inference.inference.elements import TextRegion -from unstructured_inference.inference.layout import DocumentLayout, PageLayout from unstructured_inference.inference.layoutelement import LayoutElement from test_unstructured.unit_utils import example_doc_path @@ -29,7 +26,6 @@ Image as ImageElement, ) from unstructured.partition.common import 
common -from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT class MockPageLayout(layout.PageLayout): @@ -399,84 +395,12 @@ def test_contains_emoji(text, expected): assert common.contains_emoji(text) is expected -def test_document_to_element_list_omits_coord_system_when_coord_points_absent(): - layout_elem_absent_coordinates = MockDocumentLayout() - for page in layout_elem_absent_coordinates.pages: - for el in page.elements: - el.bbox = None - elements = common.document_to_element_list(layout_elem_absent_coordinates) - assert elements[0].metadata.coordinates is None - - def test_get_page_image_metadata_and_coordinate_system(): doc = MockDocumentLayout() - metadata = common._get_page_image_metadata(doc.pages[0]) + metadata = common.get_page_image_metadata(doc.pages[0]) assert isinstance(metadata, dict) -@dataclass -class MockImage: - width = 640 - height = 480 - format = "JPG" - - -def test_document_to_element_list_handles_parent(): - block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText") - block2 = LayoutElement.from_coords( - 1, - 2, - 3, - 4, - text="block 2", - parent=block1, - type="NarrativeText", - ) - page = PageLayout( - number=1, - image=MockImage(), - ) - page.elements = [block1, block2] - doc = DocumentLayout.from_pages([page]) - el1, el2 = common.document_to_element_list(doc) - assert el2.metadata.parent_id == el1.id - - -@pytest.mark.parametrize( - ("sort_mode", "call_count"), - [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)], -) -def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count): - block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText") - block2 = LayoutElement.from_coords( - 1, - 2, - 3, - 4, - text="block 2", - parent=block1, - type="NarrativeText", - ) - page = PageLayout( - number=1, - image=MockImage(), - ) - page.elements = [block1, block2] - doc = DocumentLayout.from_pages([page]) - with mock.patch.object(common, "sort_page_elements") as mock_sort_page_elements: - common.document_to_element_list(doc, sortable=True, sort_mode=sort_mode) - assert mock_sort_page_elements.call_count == call_count - - -def test_document_to_element_list_sets_category_depth_titles(): - layout_with_hierarchies = MockDocumentLayout() - elements = common.document_to_element_list(layout_with_hierarchies) - assert elements[0].metadata.category_depth == 1 - assert elements[1].metadata.category_depth == 2 - assert elements[2].metadata.category_depth is None - assert elements[3].metadata.category_depth == 0 - - def test_ocr_data_to_elements( filename=example_doc_path("img/layout-parser-paper-fast.jpg"), ): diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index ac780caf7f..cea6b44129 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -5,6 +5,7 @@ import math import os import tempfile +from dataclasses import dataclass from pathlib import Path from tempfile import SpooledTemporaryFile from unittest import mock @@ -14,6 +15,8 @@ from PIL import Image from pytest_mock import MockFixture from unstructured_inference.inference import layout +from unstructured_inference.inference.layout import DocumentLayout, PageLayout +from unstructured_inference.inference.layoutelement import LayoutElement from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from unstructured.chunking.title 
import chunk_by_title @@ -32,9 +35,12 @@ ) from unstructured.errors import PageCountExceededError from unstructured.partition import pdf, strategies -from unstructured.partition.pdf import get_uris_from_annots from unstructured.partition.pdf_image import ocr, pdfminer_processing +from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots from unstructured.partition.utils.constants import ( + SORT_MODE_BASIC, + SORT_MODE_DONT, + SORT_MODE_XY_CUT, UNSTRUCTURED_INCLUDE_DEBUG_METADATA, PartitionStrategy, ) @@ -95,6 +101,37 @@ def __init__(self, number: int, image: Image): ] +class MockSinglePageLayout(layout.PageLayout): + def __init__(self, number: int, image: Image.Image): + self.number = number + self.image = image + + @property + def elements(self): + return [ + LayoutElement( + type="Headline", + text="Charlie Brown and the Great Pumpkin", + bbox=None, + ), + LayoutElement( + type="Subheadline", + text="The Beginning", + bbox=None, + ), + LayoutElement( + type="Text", + text="This time Charlie Brown had it really tricky...", + bbox=None, + ), + LayoutElement( + type="Title", + text="Another book title in the same page", + bbox=None, + ), + ] + + class MockDocumentLayout(layout.DocumentLayout): @property def pages(self): @@ -104,6 +141,14 @@ def pages(self): ] +class MockSinglePageDocumentLayout(layout.DocumentLayout): + @property + def pages(self): + return [ + MockSinglePageLayout(number=1, image=Image.new("1", (1, 1))), + ] + + @pytest.mark.parametrize( ("filename", "file"), [ @@ -787,11 +832,14 @@ def test_combine_numbered_list(filename): @pytest.mark.parametrize( - "filename", - [example_doc_path("pdf/layout-parser-paper-fast.pdf")], + ("filename", "strategy"), + [ + (example_doc_path("pdf/layout-parser-paper-fast.pdf"), "fast"), + (example_doc_path("pdf/layout-parser-paper-fast.pdf"), "hi_res"), + ], ) -def test_partition_pdf_hyperlinks(filename): - elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO) +def test_partition_pdf_hyperlinks(filename, strategy): + elements = pdf.partition_pdf(filename=filename, strategy=strategy) links = [ { "text": "8", @@ -813,11 +861,14 @@ def test_partition_pdf_hyperlinks(filename): @pytest.mark.parametrize( - "filename", - [example_doc_path("pdf/embedded-link.pdf")], + ("filename", "strategy"), + [ + (example_doc_path("pdf/embedded-link.pdf"), "fast"), + (example_doc_path("pdf/embedded-link.pdf"), "hi_res"), + ], ) -def test_partition_pdf_hyperlinks_multiple_lines(filename): - elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO) +def test_partition_pdf_hyperlinks_multiple_lines(filename, strategy): + elements = pdf.partition_pdf(filename=filename, strategy=strategy) assert elements[-1].metadata.links[-1]["text"] == "capturing" assert len(elements[-1].metadata.links) == 2 @@ -1392,3 +1443,75 @@ def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_ pdf_hi_res_max_pages=pdf_hi_res_max_pages, is_image=is_image, ) + + +def test_document_to_element_list_omits_coord_system_when_coord_points_absent(): + layout_elem_absent_coordinates = MockSinglePageDocumentLayout() + for page in layout_elem_absent_coordinates.pages: + for el in page.elements: + el.bbox = None + elements = pdf.document_to_element_list(layout_elem_absent_coordinates) + assert elements[0].metadata.coordinates is None + + +@dataclass +class MockImage: + width = 640 + height = 480 + format = "JPG" + + +def test_document_to_element_list_handles_parent(): + block1 = 
LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText") + block2 = LayoutElement.from_coords( + 1, + 2, + 3, + 4, + text="block 2", + parent=block1, + type="NarrativeText", + ) + page = PageLayout( + number=1, + image=MockImage(), + ) + page.elements = [block1, block2] + doc = DocumentLayout.from_pages([page]) + el1, el2 = pdf.document_to_element_list(doc) + assert el2.metadata.parent_id == el1.id + + +@pytest.mark.parametrize( + ("sort_mode", "call_count"), + [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)], +) +def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count): + block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText") + block2 = LayoutElement.from_coords( + 1, + 2, + 3, + 4, + text="block 2", + parent=block1, + type="NarrativeText", + ) + page = PageLayout( + number=1, + image=MockImage(), + ) + page.elements = [block1, block2] + doc = DocumentLayout.from_pages([page]) + with mock.patch.object(pdf, "sort_page_elements") as mock_sort_page_elements: + pdf.document_to_element_list(doc, sortable=True, sort_mode=sort_mode) + assert mock_sort_page_elements.call_count == call_count + + +def test_document_to_element_list_sets_category_depth_titles(): + layout_with_hierarchies = MockSinglePageDocumentLayout() + elements = pdf.document_to_element_list(layout_with_hierarchies) + assert elements[0].metadata.category_depth == 1 + assert elements[1].metadata.category_depth == 2 + assert elements[2].metadata.category_depth is None + assert elements[3].metadata.category_depth == 0 diff --git a/test_unstructured_ingest/expected-structured-output/astradb/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.json b/test_unstructured_ingest/expected-structured-output/astradb/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.json new file mode 100644 index 0000000000..38a2308f2a --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/astradb/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.csv.json @@ -0,0 +1,20 @@ +[ + { + "type": "Table", + "element_id": "29fba2aa35cbdea208791e942ac3c40c", + "text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext 25b75f1d-a2ea-4c97-b75f-1da2eadc97f7 City Hunter: Shinjuku Private Eyes 2558908 2019-02-14 Matt Schley 2.5/5 rotten The film's out-of-touch attempts at humor may find them hunting for the reason the franchise was so popular in the first place.", + "metadata": { + "text_as_html": "
_idtitlereviewidcreationdatecriticnameoriginalscorereviewstatereviewtext
25b75f1d-a2ea-4c97-b75f-1da2eadc97f7City Hunter: Shinjuku Private Eyes25589082019-02-14Matt Schley2.5/5rottenThe film's out-of-touch attempts at humor may find them hunting for the reason the franchise was so popular in the first place.
", + "languages": [ + "eng" + ], + "filetype": "text/csv", + "data_source": { + "record_locator": { + "document_id": "25b75f1d-a2ea-4c97-b75f-1da2eadc97f7" + }, + "filesize_bytes": 326 + } + } + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/astradb/60297eea-73d7-4fca-a97e-ea73d7cfca62.csv.json b/test_unstructured_ingest/expected-structured-output/astradb/60297eea-73d7-4fca-a97e-ea73d7cfca62.csv.json new file mode 100644 index 0000000000..42fa39a313 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/astradb/60297eea-73d7-4fca-a97e-ea73d7cfca62.csv.json @@ -0,0 +1,20 @@ +[ + { + "type": "Table", + "element_id": "b3b034c9f8fb0ab442599982063f0590", + "text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext 60297eea-73d7-4fca-a97e-ea73d7cfca62 City Hunter: Shinjuku Private Eyes 2590987 2019-05-28 Reuben Baron fresh The choreography is so precise and lifelike at points one might wonder whether the movie was rotoscoped, but no live-action reference footage was used. The quality is due to the skill of the animators and Kodama's love for professional wrestling.", + "metadata": { + "text_as_html": "
_idtitlereviewidcreationdatecriticnameoriginalscorereviewstatereviewtext
60297eea-73d7-4fca-a97e-ea73d7cfca62City Hunter: Shinjuku Private Eyes25909872019-05-28Reuben BaronfreshThe choreography is so precise and lifelike at points one might wonder whether the movie was rotoscoped, but no live-action reference footage was used. The quality is due to the skill of the animators and Kodama's love for professional wrestling.
", + "languages": [ + "eng" + ], + "filetype": "text/csv", + "data_source": { + "record_locator": { + "document_id": "60297eea-73d7-4fca-a97e-ea73d7cfca62" + }, + "filesize_bytes": 442 + } + } + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/astradb/641d99e3-9941-4c18-9d99-e399414c183d.csv.json b/test_unstructured_ingest/expected-structured-output/astradb/641d99e3-9941-4c18-9d99-e399414c183d.csv.json new file mode 100644 index 0000000000..c683874d39 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/astradb/641d99e3-9941-4c18-9d99-e399414c183d.csv.json @@ -0,0 +1,20 @@ +[ + { + "type": "Table", + "element_id": "5447d9b5c663c4bd69cd0b05dc6963f8", + "text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext 641d99e3-9941-4c18-9d99-e399414c183d Beavers 1145982 2003-05-23 Ivan M. Lincoln 3.5/4 fresh Timed to be just long enough for most youngsters' brief attention spans -- and it's packed with plenty of interesting activity, both on land and under the water.", + "metadata": { + "text_as_html": "
_idtitlereviewidcreationdatecriticnameoriginalscorereviewstatereviewtext
641d99e3-9941-4c18-9d99-e399414c183dBeavers11459822003-05-23Ivan M. Lincoln3.5/4freshTimed to be just long enough for most youngsters' brief attention spans -- and it's packed with plenty of interesting activity, both on land and under the water.
", + "languages": [ + "eng" + ], + "filetype": "text/csv", + "data_source": { + "record_locator": { + "document_id": "641d99e3-9941-4c18-9d99-e399414c183d" + }, + "filesize_bytes": 338 + } + } + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/astradb/762c0093-2277-4f3e-ac00-932277af3e0e.csv.json b/test_unstructured_ingest/expected-structured-output/astradb/762c0093-2277-4f3e-ac00-932277af3e0e.csv.json new file mode 100644 index 0000000000..86bae5eb8d --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/astradb/762c0093-2277-4f3e-ac00-932277af3e0e.csv.json @@ -0,0 +1,20 @@ +[ + { + "type": "Table", + "element_id": "58d819acf213d1eb12b54f806e907722", + "text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext 762c0093-2277-4f3e-ac00-932277af3e0e Blood Mask 1636744 2007-06-02 The Foywonder 1/5 rotten It doesn't matter if a movie costs 300 million or only 300 dollars; good is good and bad is bad, and Bloodmask: The Possession of Nicole Lameroux is just plain bad.", + "metadata": { + "text_as_html": "
_idtitlereviewidcreationdatecriticnameoriginalscorereviewstatereviewtext
762c0093-2277-4f3e-ac00-932277af3e0eBlood Mask16367442007-06-02The Foywonder1/5rottenIt doesn't matter if a movie costs 300 million or only 300 dollars; good is good and bad is bad, and Bloodmask: The Possession of Nicole Lameroux is just plain bad.
", + "languages": [ + "eng" + ], + "filetype": "text/csv", + "data_source": { + "record_locator": { + "document_id": "762c0093-2277-4f3e-ac00-932277af3e0e" + }, + "filesize_bytes": 341 + } + } + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/astradb/ae40df94-0b3a-4f89-80df-940b3a6f8966.csv.json b/test_unstructured_ingest/expected-structured-output/astradb/ae40df94-0b3a-4f89-80df-940b3a6f8966.csv.json new file mode 100644 index 0000000000..da88c4066d --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/astradb/ae40df94-0b3a-4f89-80df-940b3a6f8966.csv.json @@ -0,0 +1,20 @@ +[ + { + "type": "Table", + "element_id": "7fc0ff06c2ca00682b8070d6f8668bc6", + "text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext ae40df94-0b3a-4f89-80df-940b3a6f8966 Dangerous Men 2504681 2018-08-29 Pat Padua fresh Its clumsy determination is endearing and sometimes wildly entertaining", + "metadata": { + "text_as_html": "
_idtitlereviewidcreationdatecriticnameoriginalscorereviewstatereviewtext
ae40df94-0b3a-4f89-80df-940b3a6f8966Dangerous Men25046812018-08-29Pat PaduafreshIts clumsy determination is endearing and sometimes wildly entertaining
", + "languages": [ + "eng" + ], + "filetype": "text/csv", + "data_source": { + "record_locator": { + "document_id": "ae40df94-0b3a-4f89-80df-940b3a6f8966" + }, + "filesize_bytes": 241 + } + } + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.json b/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.json deleted file mode 100644 index 279b3cdcc2..0000000000 --- a/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/25b75f1d-a2ea-4c97-b75f-1da2eadc97f7.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "element_id": "f0a4e037e95409782d80f79ab482e0a6", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "25b75f1d-a2ea-4c97-b75f-1da2eadc97f7", - "type": "UncategorizedText" - }, - { - "element_id": "5b07d26fd8dfe0d1eed55ade1646d117", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "City Hunter: Shinjuku Private Eyes", - "type": "Title" - }, - { - "element_id": "fa66f50dce49f55a8ee1a3b868660435", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "2558908", - "type": "UncategorizedText" - }, - { - "element_id": "04ec2a9e0508b18cd1e74299f663646e", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "2019-02-14", - "type": "UncategorizedText" - }, - { - "element_id": "614276b484bb8e257e9bd90610e1311b", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "Matt Schley", - "type": "Title" - }, - { - "element_id": "840d7108dfb83582914b422aafeb5656", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "2.5/5", - "type": "UncategorizedText" - }, - { - "element_id": "d252d04d0a940ef7fdadfeb802decdc6", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "rotten", - "type": "NarrativeText" - }, - { - "element_id": "c451f38624b0cc6014e1a1ea0e006a88", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "The film's out-of-touch attempts at humor may find them hunting for the reason the franchise was so popular in the first place.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/60297eea-73d7-4fca-a97e-ea73d7cfca62.json b/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/60297eea-73d7-4fca-a97e-ea73d7cfca62.json deleted file mode 100644 index 576d0a984b..0000000000 --- a/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/60297eea-73d7-4fca-a97e-ea73d7cfca62.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { - "element_id": "92d743481b1262db7e93beb437b6c793", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "60297eea-73d7-4fca-a97e-ea73d7cfca62 City Hunter: Shinjuku Private Eyes 2590987 2019-05-28 Reuben Baron", - "type": "Title" - }, - { - "element_id": "7abd0e1bf81ec2fec0917ceece253c4f", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "fresh The choreography is so precise and lifelike at points one might 
wonder whether the movie was rotoscoped, but no live-action reference footage was used. The quality is due to the skill of the animators and Kodama's love for professional wrestling.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/641d99e3-9941-4c18-9d99-e399414c183d.json b/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/641d99e3-9941-4c18-9d99-e399414c183d.json deleted file mode 100644 index 0b146a1ff3..0000000000 --- a/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/641d99e3-9941-4c18-9d99-e399414c183d.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "element_id": "7c4c8a27f2664fc2db5eac50c9105299", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "641d99e3-9941-4c18-9d99-e399414c183d", - "type": "UncategorizedText" - }, - { - "element_id": "d2674169f9d7a78a17b4cce81a30ab10", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "Beavers", - "type": "Title" - }, - { - "element_id": "93f68debb2a48075f5cb933213331e0a", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "1145982", - "type": "UncategorizedText" - }, - { - "element_id": "602ff3900f9c245ffda521cb04dec673", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "2003-05-23", - "type": "UncategorizedText" - }, - { - "element_id": "8cf307b6a94e7532c00f4abeae2909fd", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "Ivan M. Lincoln", - "type": "Title" - }, - { - "element_id": "6d77cf17e6d39fc13bade34de54d6df0", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "3.5/4", - "type": "UncategorizedText" - }, - { - "element_id": "eca56c8a2a202ccaac07fe0d807c92a6", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "fresh", - "type": "Title" - }, - { - "element_id": "3e9ea85f9e12c9683dc6ad6fb3f58f1f", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "Timed to be just long enough for most youngsters' brief attention spans -- and it's packed with plenty of interesting activity, both on land and under the water.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/762c0093-2277-4f3e-ac00-932277af3e0e.json b/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/762c0093-2277-4f3e-ac00-932277af3e0e.json deleted file mode 100644 index 96d42a6f61..0000000000 --- a/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/762c0093-2277-4f3e-ac00-932277af3e0e.json +++ /dev/null @@ -1,98 +0,0 @@ -[ - { - "element_id": "6c97f8135d9625ab7e1b55045edfcbc7", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "762c0093-2277-4f3e-ac00-932277af3e0e", - "type": "UncategorizedText" - }, - { - "element_id": "60f6854bdd5c1362123b707b3836ec5a", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "Blood Mask", - "type": "Title" - }, - { - "element_id": "459e0f0a9ce96831ae4f91ae912a5f25", - "metadata": { 
- "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "1636744", - "type": "UncategorizedText" - }, - { - "element_id": "9d81ee525c3fa60e79474d5159a9ac2f", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "2007-06-02", - "type": "UncategorizedText" - }, - { - "element_id": "4ca9dfe7de57c9d71f688687cccc36ec", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "The Foywonder", - "type": "Title" - }, - { - "element_id": "db5c57ce2da3356bac0a200350e5fa99", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "1/5", - "type": "UncategorizedText" - }, - { - "element_id": "4810fb7d8df8d65346f039b7ca93b70c", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "rotten", - "type": "NarrativeText" - }, - { - "element_id": "8077ef7b087ff1f0c278bc1145868240", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "It doesn't matter if a movie costs 300 million or only 300 dollars; good is good and bad is bad, and Bloodmask: The Possession of Nicole Lameroux is just plain bad.", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/ae40df94-0b3a-4f89-80df-940b3a6f8966.json b/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/ae40df94-0b3a-4f89-80df-940b3a6f8966.json deleted file mode 100644 index c54e666db3..0000000000 --- a/test_unstructured_ingest/expected-structured-output/astradb/ingest_test_src/ae40df94-0b3a-4f89-80df-940b3a6f8966.json +++ /dev/null @@ -1,146 +0,0 @@ -[ - { - "element_id": "92ace4ff9ad3621da892886eeee478a3", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "ae40df94", - "type": "Title" - }, - { - "element_id": "047ae5e12572a60afb92f698bb8e6f66", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "0b3a", - "type": "Title" - }, - { - "element_id": "31ef877bce804ebf1a15891edc11e2c7", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "4f89", - "type": "UncategorizedText" - }, - { - "element_id": "615b321c09ab3067ff128b2e4bcdcdf2", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "80df", - "type": "Title" - }, - { - "element_id": "1cb8aeff17aa47f2ef832d0f49165777", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "940b3a6f8966", - "type": "UncategorizedText" - }, - { - "element_id": "80d611a9e1670f65498590c4c3b33233", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "Dangerous Men", - "type": "Title" - }, - { - "element_id": "2706f30db2cf0f66f927c9c536ac2518", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "2504681", - "type": "UncategorizedText" - }, - { - "element_id": "a2346ae79f2c5908ef90122d2f703687", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "2018", - "type": "UncategorizedText" - }, - { - "element_id": "016c8949614d0fec6481738d0c8b45a5", - "metadata": { - 
"data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "08", - "type": "UncategorizedText" - }, - { - "element_id": "b963d1cd3855ec6d3a2c993304aee5f7", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "29", - "type": "UncategorizedText" - }, - { - "element_id": "06b0f5c271b3cb3d036fd89bd8979b38", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "Pat Padua", - "type": "Title" - }, - { - "element_id": "5f75100c8d5f9fee4007fd126d3c88f1", - "metadata": { - "data_source": {}, - "filetype": "text/plain", - "languages": [ - "eng" - ] - }, - "text": "fresh Its clumsy determination is endearing and sometimes wildly entertaining", - "type": "NarrativeText" - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 484b099f94..0cd04bffdc 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -7,6 +7,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 0, + "text": "Data in Brief 22 ( 2019 ) 451 – 457", + "url": "https://doi.org/10.1016/j.dib.2018.11.134" + } + ], "page_number": 1 }, "text": "Data in Brief 22 (2019) 451–457", @@ -33,6 +40,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 28, + "text": "ScienceDirect", + "url": "www.sciencedirect.com/science/journal/23523409" + } + ], "page_number": 1 }, "text": "Contents lists available at ScienceDirect", @@ -59,6 +73,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 18, + "text": "www . elsevier . com / locate / dib", + "url": "www.elsevier.com/locate/dib" + } + ], "page_number": 1 }, "text": "journal homepage: www.elsevier.com/locate/dib", @@ -293,6 +314,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 16, + "text": "tayo . sanni @ yahoo . com", + "url": "mailto:tayo.sanni@yahoo.com" + } + ], "page_number": 1 }, "text": "n Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za E-mail address: tayo.sanni@yahoo.com (O. Sanni).", @@ -306,6 +334,23 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 0, + "text": "https", + "url": "https://doi.org/10.1016/j.dib.2018.11.134" + }, + { + "start_index": 0, + "text": "https", + "url": "https://doi.org/10.1016/j.dib.2018.11.134" + }, + { + "start_index": 0, + "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 11 . 134", + "url": "https://doi.org/10.1016/j.dib.2018.11.134" + } + ], "page_number": 1 }, "text": "https://doi.org/10.1016/j.dib.2018.11.134", @@ -1177,6 +1222,18 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 89, + "text": "https :// doi .", + "url": "https://doi.org/10.1016/j.dib.2018.11.134" + }, + { + "start_index": 0, + "text": "org / 10 . 1016 / j . dib . 2018 . 11 . 134", + "url": "https://doi.org/10.1016/j.dib.2018.11.134" + } + ], "page_number": 7 }, "text": "Transparency document associated with this article can be found in the online version at https://doi. org/10.1016/j.dib.2018.11.134.", @@ -1203,6 +1260,23 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 4, + "text": "O . Sanni , A . P . I . Popoola , O . S . I . 
Fayomi , Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1" + }, + { + "start_index": 0, + "text": "using eco - friendly waste product , Results Phys . 9 ( 2018 ) 225 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1" + }, + { + "start_index": 0, + "text": "using eco - friendly waste product , Results Phys . 9 ( 2018 ) 225 – 230 .", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1" + } + ], "page_number": 7 }, "text": "[1] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results Phys. 9 (2018) 225–230.", @@ -1216,6 +1290,23 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 4, + "text": "O . Sanni , A . P . I . Popoola , A . Kolesnikov , Constitutive modeling for prediction of optimal process parameters in corrosion", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2" + }, + { + "start_index": 0, + "text": "inhibition of austenitic stainless steel ( Type 316 )/ acidic medium , Mater . Res . Express . 5 ( 10 ) ( 2018 ) 1 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2" + }, + { + "start_index": 0, + "text": "inhibition of austenitic stainless steel ( Type 316 )/ acidic medium , Mater . Res . Express . 5 ( 10 ) ( 2018 ) 1 – 15 .", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2" + } + ], "page_number": 7 }, "text": "[2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1–15.", @@ -1229,6 +1320,23 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 4, + "text": "O . Sanni , A . P . I . Popoola , O . S . I . Fayomi , The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3" + }, + { + "start_index": 0, + "text": "corrosion in chloride solution , Def . Technol . 14 ( 2018 ) 463 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3" + }, + { + "start_index": 0, + "text": "corrosion in chloride solution , Def . Technol . 14 ( 2018 ) 463 – 468 .", + "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3" + } + ], "page_number": 7 }, "text": "[3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel corrosion in chloride solution, Def. Technol. 14 (2018) 463–468.", @@ -1242,6 +1350,28 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 233, + "text": "https", + "url": "https://doi.org/10.1007/s13632-018-0495-5" + }, + { + "start_index": 233, + "text": "https", + "url": "https://doi.org/10.1007/s13632-018-0495-5" + }, + { + "start_index": 233, + "text": "https :// doi . org / 10 . 1007 /", + "url": "https://doi.org/10.1007/s13632-018-0495-5" + }, + { + "start_index": 258, + "text": "s13632 - 018 - 0495 - 5", + "url": "https://doi.org/10.1007/s13632-018-0495-5" + } + ], "page_number": 7 }, "text": "[4] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1–17. 
https://doi.org/10.1007/ s13632-018-0495-5.", @@ -1255,6 +1385,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 202, + "text": "https :// doi . org / 10 . 7449 / 2018 / MST _ 2018 _ 254 _ 261", + "url": "https://doi.org/10.7449/2018/MST_2018_254_261" + } + ], "page_number": 7 }, "text": "[5] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Inhibition of engineering material in sulphuric acid solution using waste product, Contributed Papers from Materials Science and Technology (MS&T18), 2018. 〈https://doi.org/10.7449/2018/MST_2018_254_261〉.", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index 5a25c95e60..63b2ca0fb5 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -7,6 +7,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 0, + "text": "Data in Brief 22 ( 2019 ) 484 – 487", + "url": "https://doi.org/10.1016/j.dib.2018.12.055" + } + ], "page_number": 1 }, "text": "Data in Brief 22 (2019) 484–487", @@ -33,6 +40,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 28, + "text": "ScienceDirect", + "url": "www.sciencedirect.com/science/journal/23523409" + } + ], "page_number": 1 }, "text": "Contents lists available at ScienceDirect", @@ -59,6 +73,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 18, + "text": "www . elsevier . com / locate / dib", + "url": "www.elsevier.com/locate/dib" + } + ], "page_number": 1 }, "text": "journal homepage: www.elsevier.com/locate/dib", @@ -332,6 +353,23 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 25, + "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 11 . 007 n Corresponding author at", + "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007" + }, + { + "start_index": 25, + "text": "https", + "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007" + }, + { + "start_index": 25, + "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 11 . 007", + "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007" + } + ], "page_number": 1 }, "text": "DOI of original article: https://doi.org/10.1016/j.trb.2018.11.007", @@ -345,6 +383,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 16, + "text": "sarangkulkarni @ iitb . ac . in", + "url": "mailto:sarangkulkarni@iitb.ac.in" + } + ], "page_number": 1 }, "text": "n Corresponding author at: IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India. E-mail address: sarangkulkarni@iitb.ac.in (S. Kulkarni).", @@ -358,6 +403,23 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 0, + "text": "https", + "url": "https://doi.org/10.1016/j.dib.2018.12.055" + }, + { + "start_index": 0, + "text": "https", + "url": "https://doi.org/10.1016/j.dib.2018.12.055" + }, + { + "start_index": 0, + "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 12 . 055", + "url": "https://doi.org/10.1016/j.dib.2018.12.055" + } + ], "page_number": 1 }, "text": "https://doi.org/10.1016/j.dib.2018.12.055", @@ -410,6 +472,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 386, + "text": "https :// orlib . uqcloud . 
net /", + "url": "https://orlib.uqcloud.net/" + } + ], "page_number": 2 }, "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data Tables, text files How data were acquired Artificially generated by a C þ þ program on Intels Xeons CPU E5– 2670 v2 with Linux operating system. Data format Raw Experimental factors Sixty randomly generated instances of the MDVSP with the number of depots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000) Experimental features Randomly generated instances Data source location IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data accessibility Data can be downloaded from https://orlib.uqcloud.net/ Related research article Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457–487 [3].", @@ -514,6 +583,13 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 611, + "text": "https :// orlib . uqcloud . net", + "url": "https://orlib.uqcloud.net" + } + ], "page_number": 2 }, "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, ‘ðm; nÞ’, respectively. For example, the problem instance, ‘RN-8–1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm; nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. For each problem instance, the following information is provided:", @@ -930,6 +1006,18 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 89, + "text": "https :// doi .", + "url": "https://doi.org/10.1016/j.dib.2018.12.055" + }, + { + "start_index": 0, + "text": "org / 10 . 1016 / j . dib . 2018 . 12 . 055", + "url": "https://doi.org/10.1016/j.dib.2018.12.055" + } + ], "page_number": 4 }, "text": "Transparency document associated with this article can be found in the online version at https://doi. org/10.1016/j.dib.2018.12.055.", @@ -956,6 +1044,23 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 4, + "text": "G . Carpaneto , M . Dell ' Amico , M . Fischetti , P . Toth , A branch and bound algorithm for the multiple depot vehicle scheduling", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1" + }, + { + "start_index": 0, + "text": "problem , Networks 19 ( 5 ) ( 1989 ) 531 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1" + }, + { + "start_index": 0, + "text": "problem , Networks 19 ( 5 ) ( 1989 ) 531 – 548 .", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1" + } + ], "page_number": 4 }, "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling problem, Networks 19 (5) (1989) 531–548.", @@ -969,6 +1074,23 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 4, + "text": "N . Kliewer , T . Mellouli , L . 
Suhl , A time – space network based exact optimization model for multi - depot bus scheduling , Eur .", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2" + }, + { + "start_index": 0, + "text": "J . Oper . Res . 175 ( 3 ) ( 2006 ) 1616 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2" + }, + { + "start_index": 0, + "text": "J . Oper . Res . 175 ( 3 ) ( 2006 ) 1616 – 1627 .", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2" + } + ], "page_number": 4 }, "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time–space network based exact optimization model for multi-depot bus scheduling, Eur. J. Oper. Res. 175 (3) (2006) 1616–1627.", @@ -982,6 +1104,23 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 4, + "text": "S . Kulkarni , M . Krishnamoorthy , A . Ranade , A . T . Ernst , R . Patil , A new formulation and a column generation - based heuristic", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3" + }, + { + "start_index": 0, + "text": "for the multiple depot vehicle scheduling problem , Transp . Res . Part B Methodol . 118 ( 2018 ) 457 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3" + }, + { + "start_index": 0, + "text": "for the multiple depot vehicle scheduling problem , Transp . Res . Part B Methodol . 118 ( 2018 ) 457 – 487 .", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3" + } + ], "page_number": 4 }, "text": "[3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457–487.", @@ -995,6 +1134,18 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 4, + "text": "A . S . Pepin , G . Desaulniers , A . Hertz , D . Huisman , A comparison of five heuristics for the multiple depot vehicle scheduling", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref4" + }, + { + "start_index": 0, + "text": "problem , J . Sched . 12 ( 1 ) ( 2009 ) 17 .", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref4" + } + ], "page_number": 4 }, "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling problem, J. Sched. 12 (1) (2009) 17.", @@ -1008,6 +1159,23 @@ "languages": [ "eng" ], + "links": [ + { + "start_index": 4, + "text": "C . C . Ribeiro , F . Soumis , A column generation approach to the multiple - depot vehicle scheduling problem , Oper . Res . 42 ( 1 )", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5" + }, + { + "start_index": 0, + "text": "( 1994 ) 41 –", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5" + }, + { + "start_index": 0, + "text": "( 1994 ) 41 – 52 .", + "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5" + } + ], "page_number": 4 }, "text": "[5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 
42 (1) (1994) 41–52.", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 66e1dbea73..30ab76d6f9 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -312,6 +312,13 @@ "element_id": "f1169388c7749db52e388e2fe4feaec6", "text": "Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model con\ufb01gurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going e\ufb00orts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.", "metadata": { + "links": [ + { + "text": "https :// layout - parser . github . io", + "url": "https://layout-parser.github.io", + "start_index": 1472 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -378,6 +385,13 @@ "element_id": "8de96d1e80af35f9b6954252e14c2caf", "text": "Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classi\ufb01cation [11,", "metadata": { + "links": [ + { + "text": "11", + "url": "cite.harley2015evaluation", + "start_index": 156 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -422,6 +436,33 @@ "element_id": "4b097cc42d7d30e720512dbce0cb4905", "text": "37], layout detection [38, 22], table detection [26], and scene text detection [4]. A generalized learning-based framework dramatically reduces the need for the manual speci\ufb01cation of complicated rules, which is the status quo with traditional methods. 
DL has the potential to transform DIA pipelines and bene\ufb01t a broad spectrum of large-scale document digitization projects.", "metadata": { + "links": [ + { + "text": "37", + "url": "cite.xu2019layoutlm", + "start_index": 0 + }, + { + "text": "38", + "url": "cite.zhong2019publaynet", + "start_index": 23 + }, + { + "text": "22", + "url": "cite.oliveira2018dhsegment", + "start_index": 27 + }, + { + "text": "26", + "url": "cite.prasad2020cascadetabnet", + "start_index": 49 + }, + { + "text": "4", + "url": "cite.baek2019character", + "start_index": 80 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -444,6 +485,23 @@ "element_id": "45844a4901777afaf6de9a0994e017eb", "text": "However, there are several practical di\ufb03culties for taking advantages of re- cent advances in DL-based methods: 1) DL models are notoriously convoluted for reuse and extension. Existing models are developed using distinct frame- works like TensorFlow [1] or PyTorch [24], and the high-level parameters can be obfuscated by implementation details [8]. It can be a time-consuming and frustrating experience to debug, reproduce, and adapt existing models for DIA, and many researchers who would bene\ufb01t the most from using these methods lack the technical background to implement them from scratch. 2) Document images contain diverse and disparate patterns across domains, and customized training is often required to achieve a desirable detection accuracy. Currently there is no full-\ufb02edged infrastructure for easily curating the target document image datasets and \ufb01ne-tuning or re-training the models. 3) DIA usually requires a sequence of models and other processing to obtain the \ufb01nal outputs. Often research teams use DL models and then perform further document analyses in separate processes, and these pipelines are not documented in any central location (and often not documented at all). This makes it di\ufb03cult for research teams to learn about how full pipelines are implemented and leads them to invest signi\ufb01cant resources in reinventing the DIA wheel.", "metadata": { + "links": [ + { + "text": "1", + "url": "cite.tensorflow2015-whitepaper", + "start_index": 252 + }, + { + "text": "24", + "url": "cite.paszke2019pytorch", + "start_index": 267 + }, + { + "text": "8", + "url": "cite.gardner2018allennlp", + "start_index": 347 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -598,6 +656,23 @@ "element_id": "583775f22c8080098beebbef960e2fbf", "text": "LayoutParser is well aligned with recent e\ufb00orts for improving DL model reusability in other disciplines like natural language processing [8, 34] and com- puter vision [35], but with a focus on unique challenges in DIA. We show LayoutParser can be applied in sophisticated and large-scale digitization projects", "metadata": { + "links": [ + { + "text": "8", + "url": "cite.gardner2018allennlp", + "start_index": 138 + }, + { + "text": "34", + "url": "cite.wolf2019huggingface", + "start_index": 141 + }, + { + "text": "35", + "url": "cite.wu2019detectron2", + "start_index": 168 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -642,6 +717,23 @@ "element_id": "50846086f4d9ece02052735686278699", "text": "that require precision, e\ufb03ciency, and robustness, as well as simple and light- weight document processing tasks focusing on e\ufb03cacy and \ufb02exibility (Section 5). 
LayoutParser is being actively maintained, and support for more deep learning models and novel methods in text-based layout analysis methods [37, 34] is planned.", "metadata": { + "links": [ + { + "text": "5", + "url": "section.1.5", + "start_index": 155 + }, + { + "text": "37", + "url": "cite.xu2019layoutlm", + "start_index": 301 + }, + { + "text": "34", + "url": "cite.wolf2019huggingface", + "start_index": 305 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -664,6 +756,33 @@ "element_id": "0ce686208eb4aba70d0cd053d50c7bc2", "text": "The rest of the paper is organized as follows. Section 2 provides an overview of related work. The core LayoutParser library, DL Model Zoo, and customized model training are described in Section 3, and the DL model hub and commu- nity platform are detailed in Section 4. Section 5 shows two examples of how LayoutParser can be used in practical DIA projects, and Section 6 concludes.", "metadata": { + "links": [ + { + "text": "2", + "url": "section.1.2", + "start_index": 55 + }, + { + "text": "3", + "url": "section.1.3", + "start_index": 195 + }, + { + "text": "4", + "url": "section.1.4", + "start_index": 268 + }, + { + "text": "5", + "url": "section.1.5", + "start_index": 279 + }, + { + "text": "6", + "url": "section.1.6", + "start_index": 371 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -708,6 +827,53 @@ "element_id": "8153390c1bb8652313be64034531449e", "text": "Recently, various DL models and datasets have been developed for layout analysis tasks. The dhSegment [22] utilizes fully convolutional networks [20] for segmen- tation tasks on historical documents. Object detection-based methods like Faster R-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38] and detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also been used in table detection [27]. However, these models are usually implemented individually and there is no uni\ufb01ed framework to load and use such models.", "metadata": { + "links": [ + { + "text": "22", + "url": "cite.oliveira2018dhsegment", + "start_index": 103 + }, + { + "text": "20", + "url": "cite.long2015fully", + "start_index": 146 + }, + { + "text": "28", + "url": "cite.ren2015faster", + "start_index": 250 + }, + { + "text": "12", + "url": "cite.he2017mask", + "start_index": 270 + }, + { + "text": "38", + "url": "cite.zhong2019publaynet", + "start_index": 318 + }, + { + "text": "30", + "url": "cite.schreiber2017deepdesrt", + "start_index": 344 + }, + { + "text": "26", + "url": "cite.prasad2020cascadetabnet", + "start_index": 348 + }, + { + "text": "29", + "url": "cite.scarselli2008graph", + "start_index": 391 + }, + { + "text": "27", + "url": "cite.qasim2019rethinking", + "start_index": 435 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -730,6 +896,63 @@ "element_id": "45d6d494603e84706884918c1f785c9f", "text": "There has been a surge of interest in creating open-source tools for document image processing: a search of document image analysis in Github leads to 5M relevant code pieces 6; yet most of them rely on traditional rule-based methods or provide limited functionalities. The closest prior research to our work is the OCR-D project7, which also tries to build a complete toolkit for DIA. However, similar to the platform developed by Neudecker et al. [21], it is designed for analyzing historical documents, and provides no supports for recent DL models. 
The DocumentLayoutAnalysis project8 focuses on processing born-digital PDF documents via analyzing the stored PDF data. Repositories like DeepLayout9 and Detectron2-PubLayNet10 are individual deep learning models trained on layout analysis datasets without support for the full DIA pipeline. The Document Analysis and Exploitation (DAE) platform [15] and the DeepDIVA project [2] aim to improve the reproducibility of DIA methods (or DL models), yet they are not actively maintained. OCR engines like Tesseract [14], easyOCR11 and paddleOCR12 usually do not come with comprehensive functionalities for other DIA tasks like layout analysis.", "metadata": { + "links": [ + { + "text": "6 ;", + "url": "Hfootnote.1", + "start_index": 175 + }, + { + "text": ",", + "url": "Hfootnote.2", + "start_index": 330 + }, + { + "text": "21", + "url": "cite.neudecker2011experimental", + "start_index": 450 + }, + { + "text": "focuses", + "url": "Hfootnote.3", + "start_index": 589 + }, + { + "text": "on", + "url": "Hfootnote.4", + "start_index": 774 + }, + { + "text": "stored PDF data . Repositories like DeepLayout9 and Detectron2 - PubLayNet10", + "url": "Hfootnote.5", + "start_index": 656 + }, + { + "text": "15", + "url": "cite.lamiroy2011open", + "start_index": 900 + }, + { + "text": "2", + "url": "cite.alberti2018deepdiva", + "start_index": 930 + }, + { + "text": "14", + "url": "cite.tesseract", + "start_index": 1065 + }, + { + "text": "and", + "url": "Hfootnote.6", + "start_index": 1080 + }, + { + "text": "usually", + "url": "Hfootnote.7", + "start_index": 1096 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -752,6 +975,13 @@ "element_id": "73feaff827cbc7089d3f95d1e5aac6aa", "text": "Recent years have also seen numerous e\ufb00orts to create libraries for promoting reproducibility and reusability in the \ufb01eld of DL. Libraries like Dectectron2 [35],", "metadata": { + "links": [ + { + "text": "35", + "url": "cite.wu2019detectron2", + "start_index": 157 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1016,6 +1246,28 @@ "element_id": "b4948db85ca791e99aa92589fc41734f", "text": "AllenNLP [8] and transformers [34] have provided the community with complete DL-based support for developing and deploying models for general computer vision and natural language processing problems. LayoutParser, on the other hand, specializes speci\ufb01cally in DIA tasks. LayoutParser is also equipped with a community platform inspired by established model hubs such as Torch Hub [23] and TensorFlow Hub [1]. It enables the sharing of pretrained models as well as full document processing pipelines that are unique to DIA tasks.", "metadata": { + "links": [ + { + "text": "8", + "url": "cite.gardner2018allennlp", + "start_index": 10 + }, + { + "text": "34", + "url": "cite.wolf2019huggingface", + "start_index": 31 + }, + { + "text": "23", + "url": "cite.paszke2017automatic", + "start_index": 381 + }, + { + "text": "1", + "url": "cite.tensorflow2015-whitepaper", + "start_index": 405 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1038,6 +1290,38 @@ "element_id": "7651db80014a85ab253367d3bd3e4f88", "text": "There have been a variety of document data collections to facilitate the development of DL models. Some examples include PRImA [3](magazine layouts), PubLayNet [38](academic paper layouts), Table Bank [18](tables in academic papers), Newspaper Navigator Dataset [16, 17](newspaper \ufb01gure layouts) and HJDataset [31](historical Japanese document layouts). 
A spectrum of models trained on these datasets are currently available in the LayoutParser model zoo to support di\ufb00erent use cases.", "metadata": { + "links": [ + { + "text": "3", + "url": "cite.antonacopoulos2009realistic", + "start_index": 128 + }, + { + "text": "38", + "url": "cite.zhong2019publaynet", + "start_index": 161 + }, + { + "text": "18", + "url": "cite.li2019tablebank", + "start_index": 202 + }, + { + "text": "16", + "url": "cite.newspaper_navigator_search_application", + "start_index": 263 + }, + { + "text": "17", + "url": "cite.newspaper_navigator_dataset", + "start_index": 267 + }, + { + "text": "31", + "url": "cite.shen2020large", + "start_index": 311 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1148,6 +1432,33 @@ "element_id": "cb534ba64da736dc53d60b660f5e1153", "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] F / M M Layouts of modern scienti\ufb01c documents PRImA [3] M - Layouts of scanned modern magazines and scienti\ufb01c reports Newspaper [17] F - Layouts of scanned US newspapers from the 20th century TableBank [18] F F Table region on modern scienti\ufb01c and business document HJDataset [31] F / M - Layouts of history Japanese documents", "metadata": { + "links": [ + { + "text": "[ 38 ]", + "url": "cite.zhong2019publaynet", + "start_index": 10 + }, + { + "text": "[ 3 ]", + "url": "cite.antonacopoulos2009realistic", + "start_index": 21 + }, + { + "text": "[ 17 ]", + "url": "cite.newspaper_navigator_dataset", + "start_index": 35 + }, + { + "text": "[ 18 ]", + "url": "cite.li2019tablebank", + "start_index": 50 + }, + { + "text": "[ 31 ]", + "url": "cite.shen2020large", + "start_index": 65 + } + ], "text_as_html": "
<table><tr><td>Dataset</td><td>Base Model'</td><td></td><td>Notes</td></tr>
<tr><td>PubLayNet B8]</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr>
<tr><td>PRImA</td><td>M</td><td>Layouts of scanned modern magazines and scientific report</td></tr>
<tr><td>Newspaper</td><td>F</td><td>Layouts of scanned US newspapers from the 20th century</td></tr>
<tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr>
<tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></table>
", "filetype": "application/pdf", "languages": [ @@ -1171,6 +1482,23 @@ "element_id": "f978160527177fa39c13774ec8dfa9cb", "text": "1 For each dataset, we train several models of di\ufb00erent sizes for di\ufb00erent needs (the trade-o\ufb00 between accuracy vs. computational cost). For \u201cbase model\u201d and \u201clarge model\u201d, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of di\ufb00erent architectures, like Faster R-CNN [28] (F) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model zoo in coming months.", "metadata": { + "links": [ + { + "text": "[ 13", + "url": "cite.he2016deep", + "start_index": 229 + }, + { + "text": "[ 28 ]", + "url": "cite.ren2015faster", + "start_index": 315 + }, + { + "text": "[ 12 ]", + "url": "cite.he2017mask", + "start_index": 339 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1237,6 +1565,23 @@ "element_id": "bbcc10c2b92de0cbdce8629f18b0d7ad", "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Di\ufb00erent from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:", "metadata": { + "links": [ + { + "text": "28", + "url": "cite.ren2015faster", + "start_index": 383 + }, + { + "text": "12", + "url": "cite.he2017mask", + "start_index": 403 + }, + { + "text": "35", + "url": "cite.wu2019detectron2", + "start_index": 588 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1347,6 +1692,13 @@ "element_id": "f888c5e8f5b1339f2af75612ea13c719", "text": "LayoutParser provides a wealth of pre-trained model weights using various datasets covering di\ufb00erent languages, time periods, and document types. Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are signi\ufb01cantly di\ufb00erent from the training dataset. As document structures and layouts vary greatly in di\ufb00erent domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp:///.", "metadata": { + "links": [ + { + "text": "7", + "url": "cite.ganin2015unsupervised", + "start_index": 167 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1457,6 +1809,13 @@ "element_id": "d997f63fd79c7e03050ca01b58dfdf0a", "text": "Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained on 5 di\ufb00erent datasets. Description of the training dataset is provided alongside with the trained models such that users can quickly identify the most suitable models for their tasks. 
Additionally, when such a model is not readily available, LayoutParser also supports training customized layout models and community sharing of the models (detailed in Section 3.5).", "metadata": { + "links": [ + { + "text": "1", + "url": "table.caption.3", + "start_index": 15 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1545,6 +1904,23 @@ "element_id": "dcdc0dc4759bd20c04026973cbe386e2", "text": "Coordinates are the cornerstones for storing layout information. Currently, three types of Coordinate data structures are provided in LayoutParser, shown in Figure 2. Interval and Rectangle are the most common data types and support specifying 1D or 2D regions within a document. They are parameterized with 2 and 4 parameters. A Quadrilateral class is also implemented to support a more generalized representation of rectangular regions when the document is skewed or distorted, where the 4 corner points can be speci\ufb01ed and a total of 8 degrees of freedom are supported. A wide collection of transformations like shift, pad, and scale, and operations like intersect, union, and is_in, are supported for these classes. Notably, it is common to separate a segment of the image and analyze it individually. LayoutParser provides full support for this scenario via image cropping operations crop_image and coordinate transformations like relative_to and condition_on that transform coordinates to and from their relative representations. We refer readers to Table 2 for a more detailed description of these operations13.", "metadata": { + "links": [ + { + "text": "2", + "url": "figure.caption.5", + "start_index": 164 + }, + { + "text": "2", + "url": "table.caption.6", + "start_index": 1062 + }, + { + "text": ".", + "url": "Hfootnote.8", + "start_index": 1117 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1677,6 +2053,18 @@ "element_id": "fa023ccf2ac1042ef254ecf47cc592ca", "text": "LayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained with the Connectionist Temporal Classi\ufb01cation (CTC) loss [10]. It can be used like the other OCR modules, and can be easily trained on customized datasets.", "metadata": { + "links": [ + { + "text": "6", + "url": "cite.deng2017image", + "start_index": 59 + }, + { + "text": "10", + "url": "cite.graves2006connectionist", + "start_index": 128 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1810,6 +2198,28 @@ "element_id": "afa5f1dc8b4ce5598f278992d818eaa9", "text": "The end goal of DIA is to transform the image-based document data into a structured database. LayoutParser supports exporting layout data into di\ufb00erent formats like JSON, csv, and will add the support for the METS/ALTO XML format 14 . It can also load datasets from layout analysis-speci\ufb01c formats like COCO [38] and the Page Format [25] for training layout models (Section 3.5).", "metadata": { + "links": [ + { + "text": "14", + "url": "Hfootnote.9", + "start_index": 230 + }, + { + "text": "38", + "url": "cite.zhong2019publaynet", + "start_index": 309 + }, + { + "text": "25", + "url": "cite.pletschacher2010page", + "start_index": 334 + }, + { + "text": "3 . 5", + "url": "subsection.1.3.5", + "start_index": 374 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -1832,6 +2242,13 @@ "element_id": "28aeb996f497c9d01d06e564483d0854", "text": "Visualization of the layout detection results is critical for both presentation and debugging. 
LayoutParser is built with an integrated API for displaying the layout information along with the original document image. Shown in Figure 3, it enables presenting layout data with rich meta information and features in di\ufb00erent modes. More detailed information can be found in the online LayoutParser documentation page.", "metadata": { + "links": [ + { + "text": "3", + "url": "figure.caption.8", + "start_index": 614 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2008,6 +2425,13 @@ "element_id": "a3498730b5cd3fe9405fad69bcf37882", "text": "LayoutParser incorporates a toolkit optimized for annotating document lay- outs using object-level active learning [32]. With the help from a layout detection model trained along with labeling, only the most important layout objects within each image, rather than the whole image, are required for labeling. The rest of the regions are automatically annotated with high con\ufb01dence predictions from the layout detection model. This allows a layout dataset to be created more e\ufb03ciently with only around 60% of the labeling budget.", "metadata": { + "links": [ + { + "text": "32", + "url": "cite.shen2020olala", + "start_index": 116 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2030,6 +2454,18 @@ "element_id": "c4ccf2cf2e7495668221cbe51534f90b", "text": "After the training dataset is curated, LayoutParser supports di\ufb00erent modes for training the layout models. Fine-tuning can be used for training models on a small newly-labeled dataset by initializing the model with existing pre-trained weights. Training from scratch can be helpful when the source dataset and target are signi\ufb01cantly di\ufb00erent and a large training set is available. However, as suggested in Studer et al.\u2019s work[33], loading pre-trained weights on large-scale datasets like ImageNet [5], even from totally di\ufb00erent domains, can still boost model performance. Through the integrated API provided by LayoutParser, users can easily compare model performances on the benchmark datasets.", "metadata": { + "links": [ + { + "text": "33", + "url": "cite.studer2019comprehensive", + "start_index": 429 + }, + { + "text": "5", + "url": "cite.imagenet_cvpr09", + "start_index": 501 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2140,6 +2576,13 @@ "element_id": "e9a86eb57ba5483acfeefb0e931402b1", "text": "Another focus of LayoutParser is promoting the reusability of layout detection models and full digitization pipelines. Similar to many existing deep learning libraries, LayoutParser comes with a community model hub for distributing layout models. End-users can upload their self-trained models to the model hub, and these models can be loaded into a similar interface as the currently available LayoutParser pre-trained models. For example, the model trained on the News Navigator dataset [17] has been incorporated in the model hub.", "metadata": { + "links": [ + { + "text": "17", + "url": "cite.newspaper_navigator_dataset", + "start_index": 490 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2316,6 +2759,13 @@ "element_id": "76dd07abeb9f4bbcb77152deb52c9dc0", "text": "In this example, LayoutParser was used to develop a comprehensive pipeline, shown in Figure 5, to gener- ate high-quality structured data from historical Japanese \ufb01rm \ufb01nancial ta- bles with complicated layouts. 
The pipeline applies two layout models to identify di\ufb00erent levels of document structures and two customized OCR engines for optimized character recog- nition accuracy.", "metadata": { + "links": [ + { + "text": "5", + "url": "figure.caption.10", + "start_index": 432 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2338,6 +2788,18 @@ "element_id": "42551c9b40827dcdc52055b4d25c6fc3", "text": "As shown in Figure 4 (a), the document contains columns of text written vertically 15, a common style in Japanese. Due to scanning noise and archaic printing technology, the columns can be skewed or have vari- able widths, and hence cannot be eas- ily identi\ufb01ed via rule-based methods. Within each column, words are sepa- rated by white spaces of variable size, and the vertical positions of objects can be an indicator of their layout type.", "metadata": { + "links": [ + { + "text": "4", + "url": "figure.caption.9", + "start_index": 19 + }, + { + "text": "15 ,", + "url": "Hfootnote.10", + "start_index": 83 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2470,6 +2932,18 @@ "element_id": "7e1f7b138c864ed8b40cf0f3d38801ec", "text": "structure, two object detection models have been trained to recognize individual columns and tokens, respectively. A small training set (400 images with approxi- mately 100 annotations each) is curated via the active learning based annotation tool [32] in LayoutParser. The models learn to identify both the categories and regions for each token or column via their distinct visual features. The layout data structure enables easy grouping of the tokens within each column, and rearranging columns to achieve the correct reading orders based on the horizontal position. Errors are identi\ufb01ed and recti\ufb01ed via checking the consistency of the model predictions. Therefore, though trained on a small dataset, the pipeline achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19] score across 5 categories for the column detection model, and a 89.23 AP across 4 categories for the token detection model.", "metadata": { + "links": [ + { + "text": "32", + "url": "cite.shen2020olala", + "start_index": 249 + }, + { + "text": "19", + "url": "cite.lin2014microsoft", + "start_index": 794 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2492,6 +2966,13 @@ "element_id": "dccaa93e7bae24dedf523dd39575dfbe", "text": "A combination of character recognition methods is developed to tackle the unique challenges in this document. In our experiments, we found that irregular spacing between the tokens led to a low character recognition recall rate, whereas existing OCR models tend to perform better on densely-arranged texts. To overcome this challenge, we create a document reorganization algorithm that rearranges the text based on the token bounding boxes detected in the layout analysis step. Figure 4 (b) illustrates the generated image of dense text, which is sent to the OCR APIs as a whole to reduce the transaction costs. 
The \ufb02exible coordinate system in LayoutParser is used to transform the OCR results relative to their original positions on the page.", "metadata": { + "links": [ + { + "text": "4", + "url": "figure.caption.9", + "start_index": 485 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2514,6 +2995,23 @@ "element_id": "60c2e2147d0b0dbd576d51b71a95a2ef", "text": "Additionally, it is common for historical documents to use unique fonts with di\ufb00erent glyphs, which signi\ufb01cantly degrades the accuracy of OCR models trained on modern texts. In this document, a special \ufb02at font is used for printing numbers and could not be detected by o\ufb00-the-shelf OCR engines. Using the highly \ufb02exible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal e\ufb00ort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identi\ufb01es characters within them using a self-trained OCR model based on a CNN-RNN [6]. The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set.", "metadata": { + "links": [ + { + "text": "6", + "url": "cite.deng2017image", + "start_index": 766 + }, + { + "text": "model detects a total of 15 possible categories , and achieves a 0 . 98 Jaccard score16", + "url": "Hfootnote.11", + "start_index": 774 + }, + { + "text": "for", + "url": "Hfootnote.12", + "start_index": 901 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2690,6 +3188,28 @@ "element_id": "445ad333fa3f7f85d2be634fbdeeb72a", "text": "Detecting tables and parsing their structures (table extraction) are of central im- portance for many document digitization tasks. Many previous works [26, 30, 27] and tools 18 have been developed to identify and parse table structures. Yet they might require training complicated models from scratch, or are only applicable for born-digital PDF documents. In this section, we show how LayoutParser can help build a light-weight accurate visual table extractor for legal docket tables using the existing resources with minimal e\ufb00ort.", "metadata": { + "links": [ + { + "text": "26", + "url": "cite.prasad2020cascadetabnet", + "start_index": 152 + }, + { + "text": "30", + "url": "cite.schreiber2017deepdesrt", + "start_index": 156 + }, + { + "text": "27", + "url": "cite.qasim2019rethinking", + "start_index": 160 + }, + { + "text": "18", + "url": "Hfootnote.13", + "start_index": 174 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2712,6 +3232,23 @@ "element_id": "923b62eb8550ec49cf6d3f2e6bac7ec8", "text": "The extractor uses a pre-trained layout detection model for identifying the table regions and some simple rules for pairing the rows and the columns in the PDF image. Mask R-CNN [12] trained on the PubLayNet dataset [38] from the LayoutParser Model Zoo can be used for detecting table regions. By \ufb01ltering out model predictions of low con\ufb01dence and removing overlapping predictions, LayoutParser can identify the tabular regions on each page, which signi\ufb01cantly simpli\ufb01es the subsequent steps. 
By applying the line detection functions within the tabular segments, provided in the utility module from LayoutParser, the pipeline can identify the three distinct columns in the tables. A row clustering method is then applied via analyzing the y coordinates of token bounding boxes in the left-most column, which are obtained from the OCR engines. A non-maximal suppression algorithm is used to remove duplicated rows with extremely small gaps. Shown in Figure 6, the built pipeline can detect tables at di\ufb00erent positions on a page accurately. Continued tables from di\ufb00erent pages are concatenated, and a structured table representation has been easily created.", "metadata": { + "links": [ + { + "text": "12", + "url": "cite.he2017mask", + "start_index": 179 + }, + { + "text": "38", + "url": "cite.zhong2019publaynet", + "start_index": 217 + }, + { + "text": "6", + "url": "figure.caption.11", + "start_index": 957 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2800,6 +3337,23 @@ "element_id": "e79cef57c86050aa5fc74e5cd3923197", "text": "LayoutParser provides a comprehensive toolkit for deep learning-based document image analysis. The o\ufb00-the-shelf library is easy to install, and can be used to build \ufb02exible and accurate pipelines for processing documents with complicated structures. It also supports high-level customization and enables easy labeling and training of DL models on unique document image datasets. The LayoutParser community platform facilitates sharing DL models and DIA pipelines, inviting discussion and promoting code reproducibility and reusability. The LayoutParser team is committed to keeping the library updated continuously and bringing the most recent advances in DL-based DIA, such as multi-modal document modeling [37, 36, 9] (an upcoming priority), to a diverse audience of end-users.", "metadata": { + "links": [ + { + "text": "37", + "url": "cite.xu2019layoutlm", + "start_index": 709 + }, + { + "text": "36", + "url": "cite.xu2020layoutlmv2", + "start_index": 713 + }, + { + "text": "9", + "url": "cite.garncarek2020lambert", + "start_index": 717 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -2866,6 +3420,13 @@ "element_id": "85e09a5617e58a3a78b22fd12eb29eaf", "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man\u00b4e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi\u00b4egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), https://www.tensorflow.org/, software available from tensor\ufb02ow.org", "metadata": { + "links": [ + { + "text": "https :// www . tensorflow . org /,", + "url": "https://www.tensorflow.org/", + "start_index": 568 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -3218,6 +3779,23 @@ "element_id": "07cef8a161dd1c3f0895c605844d678e", "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120\u2013122. UIST \u201920 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). 
https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143", "metadata": { + "links": [ + { + "text": "https :// doi . org / 10 . 1145 / 3379350 . 3416143", + "url": "https://doi.org/10.1145/3379350.3416143", + "start_index": 288 + }, + { + "text": "https :// doi - org . offcampus .", + "url": "https://doi-org.offcampus.lib.washington.edu/10.1145/3379350.3416143", + "start_index": 329 + }, + { + "text": "lib . washington . edu / 10 . 1145 / 3379350 . 3416143", + "url": "https://doi-org.offcampus.lib.washington.edu/10.1145/3379350.3416143", + "start_index": 356 + } + ], "filetype": "application/pdf", "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json index 5c4a058d2a..6264f96a86 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json @@ -9,7 +9,7 @@ ], "links": [ { - "text": "DatainBrief22 ( 2019 ) 451 \u2013", + "text": "Data in Brief 22 ( 2019 ) 451 \u2013 457", "url": "https://doi.org/10.1016/j.dib.2018.11.134", "start_index": 0 } @@ -38,9 +38,9 @@ ], "links": [ { - "text": "", + "text": "ScienceDirect", "url": "www.sciencedirect.com/science/journal/23523409", - "start_index": -1 + "start_index": 28 } ], "page_number": 1, @@ -89,7 +89,7 @@ ], "links": [ { - "text": "www . elsevier . com / locate /", + "text": "www . elsevier . com / locate / dib", "url": "www.elsevier.com/locate/dib", "start_index": 18 } @@ -487,7 +487,7 @@ "start_index": 0 }, { - "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 11 .", + "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 11 . 134", "url": "https://doi.org/10.1016/j.dib.2018.11.134", "start_index": 0 } @@ -3838,9 +3838,9 @@ ], "links": [ { - "text": ":// doi", + "text": "https :// doi .", "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 94 + "start_index": 89 } ], "page_number": 7, @@ -3918,7 +3918,7 @@ ], "links": [ { - "text": "O . Sanni , A . P . I . Popoola , O . S . I . Fayomi ,", + "text": "O . Sanni , A . P . I . Popoola , O . S . I . Fayomi , Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution", "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", "start_index": 4 } @@ -3947,12 +3947,12 @@ ], "links": [ { - "text": "usingeco - friendlywasteproduct , ResultsPhys . 9 ( 2018 ) 225 \u2013", + "text": "using eco - friendly waste product , Results Phys . 9 ( 2018 ) 225 \u2013", "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", "start_index": 0 }, { - "text": "usingeco - friendlywasteproduct , ResultsPhys . 9 ( 2018 ) 225 \u2013 230", + "text": "using eco - friendly waste product , Results Phys . 9 ( 2018 ) 225 \u2013 230 .", "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", "start_index": 0 } @@ -3981,7 +3981,7 @@ ], "links": [ { - "text": "O . Sanni , A . P . I . Popoola , A . Kolesnikov ,", + "text": "O . Sanni , A . P . I . Popoola , A . 
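As the reference entries above show, `url` holds whatever target the underlying annotation carries: absolute URIs for `URI` links and named destinations for `GoTo` links (the relocated `get_uris_from_annots`, further down in this diff, extracts both). A heuristic split between the two, for illustration only:

```python
def split_links(links):
    # Treat targets with a URI scheme or a bare host prefix as external;
    # everything else is an internal named destination (GoTo link).
    external, internal = [], []
    for link in links or []:
        url = link.get("url") or ""
        bucket = external if url.startswith(("http://", "https://", "www.")) else internal
        bucket.append(link)
    return external, internal
```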
Kolesnikov , Constitutive modeling for prediction of optimal process parameters in corrosion", "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", "start_index": 4 } @@ -4010,12 +4010,12 @@ ], "links": [ { - "text": "inhibitionofausteniticstainlesssteel ( Type316 )/ acidicmedium , Mater . Res . Express . 5 ( 10 )( 2018 ) 1 \u2013", + "text": "inhibition of austenitic stainless steel ( Type 316 )/ acidic medium , Mater . Res . Express . 5 ( 10 ) ( 2018 ) 1 \u2013", "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", "start_index": 0 }, { - "text": "inhibitionofausteniticstainlesssteel ( Type316 )/ acidicmedium , Mater . Res . Express . 5 ( 10 )( 2018 ) 1 \u2013 15", + "text": "inhibition of austenitic stainless steel ( Type 316 )/ acidic medium , Mater . Res . Express . 5 ( 10 ) ( 2018 ) 1 \u2013 15 .", "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", "start_index": 0 } @@ -4044,7 +4044,7 @@ ], "links": [ { - "text": "O . Sanni , A . P . I . Popoola , O . S . I . Fayomi ,", + "text": "O . Sanni , A . P . I . Popoola , O . S . I . Fayomi , The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel", "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", "start_index": 4 } @@ -4073,12 +4073,12 @@ ], "links": [ { - "text": "corrosioninchloridesolution , Def . Technol . 14 ( 2018 ) 463 \u2013", + "text": "corrosion in chloride solution , Def . Technol . 14 ( 2018 ) 463 \u2013", "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", "start_index": 0 }, { - "text": "corrosioninchloridesolution , Def . Technol . 14 ( 2018 ) 463 \u2013 468", + "text": "corrosion in chloride solution , Def . Technol . 14 ( 2018 ) 463 \u2013 468 .", "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", "start_index": 0 } @@ -4117,7 +4117,7 @@ "start_index": 233 }, { - "text": "https :// doi . org / 10 . 1007", + "text": "https :// doi . org / 10 . 1007 /", "url": "https://doi.org/10.1007/s13632-018-0495-5", "start_index": 233 }, diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json index d649cb4330..26955e33e1 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json @@ -9,7 +9,7 @@ ], "links": [ { - "text": "DatainBrief22 ( 2019 ) 484 \u2013", + "text": "Data in Brief 22 ( 2019 ) 484 \u2013 487", "url": "https://doi.org/10.1016/j.dib.2018.12.055", "start_index": 0 } @@ -38,9 +38,9 @@ ], "links": [ { - "text": "", + "text": "ScienceDirect", "url": "www.sciencedirect.com/science/journal/23523409", - "start_index": -1 + "start_index": 28 } ], "page_number": 1, @@ -89,7 +89,7 @@ ], "links": [ { - "text": "www . elsevier . com / locate /", + "text": "www . elsevier . com / locate / dib", "url": "www.elsevier.com/locate/dib", "start_index": 18 } @@ -316,7 +316,7 @@ ], "links": [ { - "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 11 . nCorrespondingauthorat", + "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 11 . 007 n Corresponding author at", "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", "start_index": 25 }, @@ -326,7 +326,7 @@ "start_index": 25 }, { - "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 
11 .", + "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 11 . 007", "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", "start_index": 25 } @@ -394,7 +394,7 @@ "start_index": 0 }, { - "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 12 .", + "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 12 . 055", "url": "https://doi.org/10.1016/j.dib.2018.12.055", "start_index": 0 } @@ -511,9 +511,9 @@ ], "links": [ { - "text": ".,", + "text": "https :// orlib . uqcloud . net /", "url": "https://orlib.uqcloud.net/", - "start_index": 444 + "start_index": 386 } ], "page_number": 2, @@ -782,9 +782,9 @@ ], "links": [ { - "text": "2500 , and3000 . size , \u00f0m ; n\u00de , \ufb01veinstancesareprovided . Thedatasetcanbedownloadedfromhttps :// orlib . uqcloud . net", + "text": "https :// orlib . uqcloud . net", "url": "https://orlib.uqcloud.net", - "start_index": 509 + "start_index": 611 } ], "page_number": 2, @@ -2109,9 +2109,9 @@ ], "links": [ { - "text": ":// doi", + "text": "https :// doi .", "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 94 + "start_index": 89 } ], "page_number": 4, @@ -2189,7 +2189,7 @@ ], "links": [ { - "text": "G . Carpaneto , M . Dell ' Amico , M . Fischetti , P . Toth ,", + "text": "G . Carpaneto , M . Dell ' Amico , M . Fischetti , P . Toth , A branch and bound algorithm for the multiple depot vehicle scheduling", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", "start_index": 4 } @@ -2218,12 +2218,12 @@ ], "links": [ { - "text": "problem , Networks19 ( 5 )( 1989 ) 531 \u2013", + "text": "problem , Networks 19 ( 5 ) ( 1989 ) 531 \u2013", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", "start_index": 0 }, { - "text": "problem , Networks19 ( 5 )( 1989 ) 531 \u2013 548", + "text": "problem , Networks 19 ( 5 ) ( 1989 ) 531 \u2013 548 .", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", "start_index": 0 } @@ -2252,7 +2252,7 @@ ], "links": [ { - "text": "N . Kliewer , T . Mellouli , L . Suhl , Atime \u2013 spacenetworkbasedexactoptimizationmodelformulti - depotbusscheduling , Eur", + "text": "N . Kliewer , T . Mellouli , L . Suhl , A time \u2013 space network based exact optimization model for multi - depot bus scheduling , Eur .", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", "start_index": 4 } @@ -2281,12 +2281,12 @@ ], "links": [ { - "text": "J . Oper . Res . 175 ( 3 )( 2006 ) 1616 \u2013", + "text": "J . Oper . Res . 175 ( 3 ) ( 2006 ) 1616 \u2013", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", "start_index": 0 }, { - "text": "J . Oper . Res . 175 ( 3 )( 2006 ) 1616 \u2013 1627", + "text": "J . Oper . Res . 175 ( 3 ) ( 2006 ) 1616 \u2013 1627 .", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", "start_index": 0 } @@ -2315,7 +2315,7 @@ ], "links": [ { - "text": "S . Kulkarni , M . Krishnamoorthy , A . Ranade , A . T . Ernst , R . Patil , Anewformulationandacolumngeneration -", + "text": "S . Kulkarni , M . Krishnamoorthy , A . Ranade , A . T . Ernst , R . Patil , A new formulation and a column generation - based heuristic", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", "start_index": 4 } @@ -2344,12 +2344,12 @@ ], "links": [ { - "text": "forthemultipledepotvehicleschedulingproblem , Transp . Res . PartBMethodol . 118 ( 2018 ) 457 \u2013", + "text": "for the multiple depot vehicle scheduling problem , Transp . Res . Part B Methodol . 
118 ( 2018 ) 457 \u2013", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", "start_index": 0 }, { - "text": "forthemultipledepotvehicleschedulingproblem , Transp . Res . PartBMethodol . 118 ( 2018 ) 457 \u2013 487", + "text": "for the multiple depot vehicle scheduling problem , Transp . Res . Part B Methodol . 118 ( 2018 ) 457 \u2013 487 .", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", "start_index": 0 } @@ -2378,7 +2378,7 @@ ], "links": [ { - "text": "A . S . Pepin , G . Desaulniers , A . Hertz , D . Huisman ,", + "text": "A . S . Pepin , G . Desaulniers , A . Hertz , D . Huisman , A comparison of \ufb01ve heuristics for the multiple depot vehicle scheduling", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref4", "start_index": 4 } @@ -2407,7 +2407,7 @@ ], "links": [ { - "text": "problem , J . Sched . 12 ( 1 )( 2009 ) 17", + "text": "problem , J . Sched . 12 ( 1 ) ( 2009 ) 17 .", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref4", "start_index": 0 } @@ -2436,7 +2436,7 @@ ], "links": [ { - "text": "C . C . Ribeiro , F . Soumis , Acolumngenerationapproachtothemultiple - depotvehicleschedulingproblem , Oper . Res . 42 ( 1", + "text": "C . C . Ribeiro , F . Soumis , A column generation approach to the multiple - depot vehicle scheduling problem , Oper . Res . 42 ( 1 )", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", "start_index": 4 } @@ -2470,7 +2470,7 @@ "start_index": 0 }, { - "text": "( 1994 ) 41 \u2013 52", + "text": "( 1994 ) 41 \u2013 52 .", "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", "start_index": 0 } diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/page-with-formula.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/page-with-formula.pdf.json index 81b6237a77..a6d316c09c 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/page-with-formula.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/s3/page-with-formula.pdf.json @@ -9,9 +9,9 @@ ], "links": [ { - "text": "Theseareconcatenatedandonceagainprojected , resultinginthefinalvalues , depictedinFigure2", + "text": "2", "url": "figure.2", - "start_index": 15 + "start_index": 116 } ], "page_number": 1, @@ -385,9 +385,9 @@ ], "links": [ { - "text": "\u2212\u221e) ofthesoftmaxwhichcorrespondtoillegalconnections . SeeFigure2", + "text": "2", "url": "figure.2", - "start_index": 347 + "start_index": 442 } ], "page_number": 1, diff --git a/test_unstructured_ingest/expected-structured-output/s3/page-with-formula.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/page-with-formula.pdf.json index 85e7118d4f..4a64ad4882 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/page-with-formula.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/page-with-formula.pdf.json @@ -4,6 +4,13 @@ "element_id": "7581b3e14a56c276896da707704c221e", "text": "output values. These are concatenated and once again projected, resulting in the final values, as depicted in Figure 2.", "metadata": { + "links": [ + { + "text": "2", + "url": "figure.2", + "start_index": 116 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -158,6 +165,23 @@ "element_id": "fd24bf7bf21b4aab2a36021f9ebb253b", "text": "\u2022 In \"encoder-decoder attention\" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. 
This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [38, 2, 9].", "metadata": { + "links": [ + { + "text": "38", + "url": "cite.wu2016google", + "start_index": 354 + }, + { + "text": "2", + "url": "cite.bahdanau2014neural", + "start_index": 358 + }, + { + "text": "9", + "url": "cite.JonasFaceNet2017", + "start_index": 361 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -202,6 +226,13 @@ "element_id": "41b9b9d2a4329a8f6075f4776403c2de", "text": "\u2022 Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to \u2212\u221e) all values in the input of the softmax which correspond to illegal connections. See Figure 2.", "metadata": { + "links": [ + { + "text": "2", + "url": "figure.2", + "start_index": 442 + } + ], "filetype": "application/pdf", "languages": [ "eng" @@ -334,6 +365,13 @@ "element_id": "ebdf8de46645084127f7ff7b24ed87e9", "text": "Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transfor- mation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [30]. In the embedding layers, we multiply those weights by \u221a dmodel.", "metadata": { + "links": [ + { + "text": "30", + "url": "cite.press2016using", + "start_index": 439 + } + ], "filetype": "application/pdf", "languages": [ "eng" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index cad19494f6..602ae7b7f7 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.4-dev2" # pragma: no cover +__version__ = "0.16.4" # pragma: no cover diff --git a/unstructured/partition/common/common.py b/unstructured/partition/common/common.py index 2605acb97e..267630a87b 100644 --- a/unstructured/partition/common/common.py +++ b/unstructured/partition/common/common.py @@ -21,18 +21,12 @@ ListItem, PageBreak, Text, - Title, ) from unstructured.logger import logger from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE -from unstructured.partition.utils.constants import SORT_MODE_DONT, SORT_MODE_XY_CUT -from unstructured.utils import dependency_exists, first - -if dependency_exists("numpy") and dependency_exists("cv2"): - from unstructured.partition.utils.sorting import sort_page_elements if TYPE_CHECKING: - from unstructured_inference.inference.layout import DocumentLayout, PageLayout + from unstructured_inference.inference.layout import PageLayout from unstructured_inference.inference.layoutelement import LayoutElement @@ -406,7 +400,7 @@ def contains_emoji(s: str) -> bool: return bool(emoji.emoji_count(s)) -def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]: +def get_page_image_metadata(page: PageLayout) -> dict[str, Any]: """Retrieve image metadata and coordinate system from a page.""" image = getattr(page, "image", None) @@ -432,113 +426,6 @@ def _get_page_image_metadata(page: PageLayout) -> 
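For orientation: `_get_page_image_metadata` becomes the public `get_page_image_metadata` here because `document_to_element_list`, which moves from this module into `pdf.py` below, still needs it to build a per-page pixel coordinate system. A condensed sketch of that pattern, assuming a `page` that carries an image or stored image metadata:

```python
from unstructured.documents.coordinates import PixelSpace
from unstructured.partition.common.common import get_page_image_metadata

def coordinate_system_for(page):
    # Width/height come from the page image (or its stored metadata);
    # fall back to None when dimensions are unavailable.
    meta = get_page_image_metadata(page)
    width, height = meta.get("width"), meta.get("height")
    return PixelSpace(width=width, height=height) if width and height else None
```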
dict[str, Any]: } -# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in -# unstructured.documents.html, which imports this module so we can't import the class for type -# hints. Moreover, those two types of documents have different lists of attributes -# UPDATE(scanny): HTMLDocument no longer exists, so this function can be optimized for use by -# DocumentLayout only. -def document_to_element_list( - document: DocumentLayout, - sortable: bool = False, - include_page_breaks: bool = False, - last_modification_date: Optional[str] = None, - infer_list_items: bool = True, - source_format: Optional[str] = None, - detection_origin: Optional[str] = None, - sort_mode: str = SORT_MODE_XY_CUT, - languages: Optional[list[str]] = None, - starting_page_number: int = 1, - **kwargs: Any, -) -> list[Element]: - """Converts a DocumentLayout object to a list of unstructured elements.""" - elements: list[Element] = [] - - num_pages = len(document.pages) - for page_number, page in enumerate(document.pages, start=starting_page_number): - page_elements: list[Element] = [] - - page_image_metadata = _get_page_image_metadata(page) - image_format = page_image_metadata.get("format") - image_width = page_image_metadata.get("width") - image_height = page_image_metadata.get("height") - - translation_mapping: list[tuple["LayoutElement", Element]] = [] - for layout_element in page.elements: - if image_width and image_height and hasattr(layout_element.bbox, "coordinates"): - coordinate_system = PixelSpace(width=image_width, height=image_height) - else: - coordinate_system = None - - element = normalize_layout_element( - layout_element, - coordinate_system=coordinate_system, - infer_list_items=infer_list_items, - source_format=source_format if source_format else "html", - ) - if isinstance(element, list): - for el in element: - if last_modification_date: - el.metadata.last_modified = last_modification_date - el.metadata.page_number = page_number - page_elements.extend(element) - translation_mapping.extend([(layout_element, el) for el in element]) - continue - else: - if last_modification_date: - element.metadata.last_modified = last_modification_date - element.metadata.text_as_html = getattr(layout_element, "text_as_html", None) - element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None) - # FIXME: here the elements in a page can be either: - # 1. LayoutElement if the document is LayoutDocument (if the partition is on a - # pdf/image); - # 2. 
Element if the document is HTMLDocument (if the partition is on an html file) - # this discrepency is due to Element class defined in unstructured and LayoutElement - # class defined in unstructured_inference do not have the same list of attributes - if (isinstance(element, Title) and element.metadata.category_depth is None) and any( - getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements - ): - element.metadata.category_depth = 0 - - page_elements.append(element) - translation_mapping.append((layout_element, element)) - coordinates = ( - element.metadata.coordinates.points if element.metadata.coordinates else None - ) - - el_image_path = ( - layout_element.image_path if hasattr(layout_element, "image_path") else None - ) - - add_element_metadata( - element, - page_number=page_number, - filetype=image_format, - coordinates=coordinates, - coordinate_system=coordinate_system, - category_depth=element.metadata.category_depth, - image_path=el_image_path, - detection_origin=detection_origin, - languages=languages, - **kwargs, - ) - - for layout_element, element in translation_mapping: - if hasattr(layout_element, "parent") and layout_element.parent is not None: - element_parent = first( - (el for l_el, el in translation_mapping if l_el is layout_element.parent), - ) - element.metadata.parent_id = element_parent.id - sorted_page_elements = page_elements - if sortable and sort_mode != SORT_MODE_DONT: - sorted_page_elements = sort_page_elements(page_elements, sort_mode) - - if include_page_breaks and page_number < num_pages + starting_page_number: - sorted_page_elements.append(PageBreak(text="")) - elements.extend(sorted_page_elements) - - return elements - - def ocr_data_to_elements( ocr_data: list["LayoutElement"], image_size: tuple[int | float, int | float], diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index e7b5b516fa..f87812d40b 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -12,12 +12,13 @@ import numpy as np import wrapt from pdfminer import psparser -from pdfminer.layout import LTChar, LTContainer, LTImage, LTItem, LTTextBox -from pdfminer.pdftypes import PDFObjRef +from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox from pdfminer.utils import open_filename from pi_heif import register_heif_opener from PIL import Image as PILImage from pypdf import PdfReader +from unstructured_inference.inference.layout import DocumentLayout +from unstructured_inference.inference.layoutelement import LayoutElement from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import ( @@ -35,6 +36,7 @@ ListItem, PageBreak, Text, + Title, process_metadata, ) from unstructured.errors import PageCountExceededError @@ -43,8 +45,10 @@ from unstructured.logger import logger, trace_logger from unstructured.nlp.patterns import PARAGRAPH_PATTERN from unstructured.partition.common.common import ( - document_to_element_list, + add_element_metadata, exactly_one, + get_page_image_metadata, + normalize_layout_element, ocr_data_to_elements, spooled_to_bytes_io_if_needed, ) @@ -68,7 +72,12 @@ save_elements, ) from unstructured.partition.pdf_image.pdfminer_processing import ( + check_annotations_within_element, clean_pdfminer_inner_elements, + get_links_in_element, + get_uris, + get_words_from_obj, + map_bbox_and_index, merge_inferred_with_extracted_layout, ) from unstructured.partition.pdf_image.pdfminer_utils import ( @@ -88,7 +97,7 @@ ) from unstructured.partition.utils.sorting import 
coord_has_valid_points, sort_page_elements from unstructured.patches.pdfminer import parse_keyword -from unstructured.utils import requires_dependencies +from unstructured.utils import first, requires_dependencies if TYPE_CHECKING: pass @@ -450,7 +459,7 @@ def _process_pdfminer_pages( page_number, annotation_threshold, ) - _, words = get_word_bounding_box_from_element(obj, height) + _, words = get_words_from_obj(obj, height) for annot in annotations_within_element: urls_metadata.append(map_bbox_and_index(words, annot)) @@ -583,10 +592,10 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, ) - extracted_layout = ( + extracted_layout, layouts_links = ( process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi) if pdf_text_extractable - else [] + else ([], []) ) if analysis: @@ -636,8 +645,10 @@ def _partition_pdf_or_image_local( if hasattr(file, "seek"): file.seek(0) - extracted_layout = ( - process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) if pdf_text_extractable else [] + extracted_layout, layouts_links = ( + process_data_with_pdfminer(file=file, dpi=pdf_image_dpi) + if pdf_text_extractable + else ([], []) ) if analysis: @@ -696,6 +707,7 @@ def _partition_pdf_or_image_local( infer_list_items=False, languages=languages, starting_page_number=starting_page_number, + layouts_links=layouts_links, **kwargs, ) @@ -1076,323 +1088,111 @@ def check_coords_within_boundary( return x_within_boundary and y_within_boundary -def get_uris( - annots: PDFObjRef | list[PDFObjRef], - height: float, - coordinate_system: PixelSpace | PointSpace, - page_number: int, -) -> list[dict[str, Any]]: - """ - Extracts URI annotations from a single or a list of PDF object references on a specific page. - The type of annots (list or not) depends on the pdf formatting. The function detectes the type - of annots and then pass on to get_uris_from_annots function as a list. - - Args: - annots (PDFObjRef | list[PDFObjRef]): A single or a list of PDF object references - representing annotations on the page. - height (float): The height of the page in the specified coordinate system. - coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent - the annotations' coordinates. - page_number (int): The page number from which to extract annotations. - - Returns: - list[dict]: A list of dictionaries, each containing information about a URI annotation, - including its coordinates, bounding box, type, URI link, and page number. - """ - if isinstance(annots, list): - return get_uris_from_annots(annots, height, coordinate_system, page_number) - resolved_annots = annots.resolve() - if resolved_annots is None: - return [] - return get_uris_from_annots(resolved_annots, height, coordinate_system, page_number) - - -def get_uris_from_annots( - annots: list[PDFObjRef], - height: int | float, - coordinate_system: PixelSpace | PointSpace, - page_number: int, -) -> list[dict[str, Any]]: - """ - Extracts URI annotations from a list of PDF object references. - - Args: - annots (list[PDFObjRef]): A list of PDF object references representing annotations on - a page. - height (int | float): The height of the page in the specified coordinate system. - coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent - the annotations' coordinates. - page_number (int): The page number from which to extract annotations. 
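Note the changed contract above: `process_file_with_pdfminer` / `process_data_with_pdfminer` now return a `(extracted_layout, layouts_links)` pair, with one links list per page, instead of a bare layout list, so existing callers must unpack accordingly. A minimal sketch; the file name is illustrative:

```python
from unstructured.partition.pdf_image.pdfminer_processing import process_file_with_pdfminer

# `layouts_links` holds one list of link dicts per page (empty for PDFs
# without an extractable text layer); `extracted_layout` is the pdfminer
# layout as before.
extracted_layout, layouts_links = process_file_with_pdfminer(
    filename="embedded-link.pdf", dpi=200
)
```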
- - Returns: - list[dict]: A list of dictionaries, each containing information about a URI annotation, - including its coordinates, bounding box, type, URI link, and page number. - """ - annotation_list = [] - for annotation in annots: - # Check annotation is valid for extraction - annotation_dict = try_resolve(annotation) - if not isinstance(annotation_dict, dict): - continue - subtype = annotation_dict.get("Subtype", None) - if not subtype or isinstance(subtype, PDFObjRef) or str(subtype) != "/'Link'": - continue - # Extract bounding box and update coordinates - rect = annotation_dict.get("Rect", None) - if not rect or isinstance(rect, PDFObjRef) or len(rect) != 4: - continue - x1, y1, x2, y2 = rect_to_bbox(rect, height) - points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) - coordinates_metadata = CoordinatesMetadata( - points=points, - system=coordinate_system, - ) - # Extract type - if "A" not in annotation_dict: - continue - uri_dict = try_resolve(annotation_dict["A"]) - if not isinstance(uri_dict, dict): - continue - uri_type = None - if "S" in uri_dict and not isinstance(uri_dict["S"], PDFObjRef): - uri_type = str(uri_dict["S"]) - # Extract URI link - uri = None - try: - if uri_type == "/'URI'": - uri = try_resolve(try_resolve(uri_dict["URI"])).decode("utf-8") - if uri_type == "/'GoTo'": - uri = try_resolve(try_resolve(uri_dict["D"])).decode("utf-8") - except Exception: - pass - - annotation_list.append( - { - "coordinates": coordinates_metadata, - "bbox": (x1, y1, x2, y2), - "type": uri_type, - "uri": uri, - "page_number": page_number, - }, - ) - return annotation_list - - -def try_resolve(annot: PDFObjRef): - """ - Attempt to resolve a PDF object reference. If successful, returns the resolved object; - otherwise, returns the original reference. - """ - try: - return annot.resolve() - except Exception: - return annot - - -def calculate_intersection_area( - bbox1: tuple[float, float, float, float], - bbox2: tuple[float, float, float, float], -) -> float: - """ - Calculate the area of intersection between two bounding boxes. +def document_to_element_list( + document: DocumentLayout, + sortable: bool = False, + include_page_breaks: bool = False, + last_modification_date: Optional[str] = None, + infer_list_items: bool = True, + source_format: Optional[str] = None, + detection_origin: Optional[str] = None, + sort_mode: str = SORT_MODE_XY_CUT, + languages: Optional[list[str]] = None, + starting_page_number: int = 1, + layouts_links: Optional[list[list]] = None, + **kwargs: Any, +) -> list[Element]: + """Converts a DocumentLayout object to a list of unstructured elements.""" + elements: list[Element] = [] - Args: - bbox1 (tuple[float, float, float, float]): The coordinates of the first bounding box - in the format (x1, y1, x2, y2). - bbox2 (tuple[float, float, float, float]): The coordinates of the second bounding box - in the format (x1, y1, x2, y2). + num_pages = len(document.pages) + for page_number, page in enumerate(document.pages, start=starting_page_number): + page_elements: list[Element] = [] - Returns: - float: The area of intersection between the two bounding boxes. If there is no - intersection, the function returns 0.0. 
- """ - x1_1, y1_1, x2_1, y2_1 = bbox1 - x1_2, y1_2, x2_2, y2_2 = bbox2 + page_image_metadata = get_page_image_metadata(page) + image_format = page_image_metadata.get("format") + image_width = page_image_metadata.get("width") + image_height = page_image_metadata.get("height") - x_intersection = max(x1_1, x1_2) - y_intersection = max(y1_1, y1_2) - x2_intersection = min(x2_1, x2_2) - y2_intersection = min(y2_1, y2_2) + translation_mapping: list[tuple["LayoutElement", Element]] = [] - if x_intersection < x2_intersection and y_intersection < y2_intersection: - intersection_area = calculate_bbox_area( - (x_intersection, y_intersection, x2_intersection, y2_intersection), + links = ( + layouts_links[page_number - starting_page_number] + if layouts_links and layouts_links[0] + else None ) - return intersection_area - else: - return 0.0 - - -def calculate_bbox_area(bbox: tuple[float, float, float, float]) -> float: - """ - Calculate the area of a bounding box. - - Args: - bbox (tuple[float, float, float, float]): The coordinates of the bounding box - in the format (x1, y1, x2, y2). - Returns: - float: The area of the bounding box, computed as the product of its width and height. - """ - x1, y1, x2, y2 = bbox - area = (x2 - x1) * (y2 - y1) - return area - - -def check_annotations_within_element( - annotation_list: list[dict[str, Any]], - element_bbox: tuple[float, float, float, float], - page_number: int, - annotation_threshold: float, -) -> list[dict[str, Any]]: - """ - Filter annotations that are within or highly overlap with a specified element on a page. - - Args: - annotation_list (list[dict[str,Any]]): A list of dictionaries, each containing information - about an annotation. - element_bbox (tuple[float, float, float, float]): The bounding box coordinates of the - specified element in the bbox format (x1, y1, x2, y2). - page_number (int): The page number to which the annotations and element belong. - annotation_threshold (float, optional): The threshold value (between 0.0 and 1.0) - that determines the minimum overlap required for an annotation to be considered - within the element. Default is 0.9. - - Returns: - list[dict[str,Any]]: A list of dictionaries containing information about annotations - that are within or highly overlap with the specified element on the given page, based on - the specified threshold. - """ - annotations_within_element = [] - for annotation in annotation_list: - if annotation["page_number"] == page_number: - annotation_bbox_size = calculate_bbox_area(annotation["bbox"]) - if annotation_bbox_size and ( - calculate_intersection_area(element_bbox, annotation["bbox"]) / annotation_bbox_size - > annotation_threshold - ): - annotations_within_element.append(annotation) - return annotations_within_element - - -def get_word_bounding_box_from_element( - obj: LTTextBox, - height: float, -) -> tuple[list[LTChar], list[dict[str, Any]]]: - """ - Extracts characters and word bounding boxes from a PDF text element. - - Args: - obj (LTTextBox): The PDF text element from which to extract characters and words. - height (float): The height of the page in the specified coordinate system. + for layout_element in page.elements: + if image_width and image_height and hasattr(layout_element.bbox, "coordinates"): + coordinate_system = PixelSpace(width=image_width, height=image_height) + else: + coordinate_system = None - Returns: - tuple[list[LTChar], list[dict[str,Any]]]: A tuple containing two lists: - - list[LTChar]: A list of LTChar objects representing individual characters. 
-            - list[dict[str,Any]]]: A list of dictionaries, each containing information about
-                a word, including its text, bounding box, and start index in the element's text.
-    """
-    characters = []
-    words = []
-    text_len = 0
-
-    for text_line in obj:
-        word = ""
-        x1, y1, x2, y2 = None, None, None, None
-        start_index = 0
-        for index, character in enumerate(text_line):
-            if isinstance(character, LTChar):
-                characters.append(character)
-                char = character.get_text()
-
-                if word and not char.strip():
-                    words.append(
-                        {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
-                    )
-                    word = ""
-                    continue
-
-                # TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9
-                # will need to switch to some pattern matching once we support more languages
-                if not word:
-                    isalnum = char.isalnum()
-                if word and char.isalnum() != isalnum:
-                    isalnum = char.isalnum()
-                    words.append(
-                        {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
-                    )
-                    word = ""
-
-                if len(word) == 0:
-                    start_index = text_len + index
-                    x1 = character.x0
-                    y2 = height - character.y0
-                    x2 = character.x1
-                    y1 = height - character.y1
-                else:
-                    x2 = character.x1
-                    y2 = height - character.y0
+            element = normalize_layout_element(
+                layout_element,
+                coordinate_system=coordinate_system,
+                infer_list_items=infer_list_items,
+                source_format=source_format if source_format else "html",
+            )
+            if isinstance(element, list):
+                for el in element:
+                    if last_modification_date:
+                        el.metadata.last_modified = last_modification_date
+                    el.metadata.page_number = page_number
+                page_elements.extend(element)
+                translation_mapping.extend([(layout_element, el) for el in element])
+                continue
+            else:
-
-                word += char
-        text_len += len(text_line)
-    return characters, words
+                element.metadata.links = (
+                    get_links_in_element(links, layout_element.bbox) if links else []
+                )
+                if last_modification_date:
+                    element.metadata.last_modified = last_modification_date
+                element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
+                element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)
-
-
-def map_bbox_and_index(words: list[dict[str, Any]], annot: dict[str, Any]):
-    """
-    Maps a bounding box annotation to the corresponding text and start index within a list of words.
+
+            if (isinstance(element, Title) and element.metadata.category_depth is None) and any(
+                getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements
+            ):
+                element.metadata.category_depth = 0
-
-    Args:
-        words (list[dict[str,Any]]): A list of dictionaries, each containing information about
-            a word, including its text, bounding box, and start index.
-        annot (dict[str,Any]): The annotation dictionary to be mapped, which will be updated with
-            "text" and "start_index" fields.
-
-    Returns:
-        dict: The updated annotation dictionary with "text" representing the mapped text and
-            "start_index" representing the start index of the mapped text in the list of words.
- """ - if len(words) == 0: - annot["text"] = "" - annot["start_index"] = -1 - return annot - distance_from_bbox_start = np.sqrt( - (annot["bbox"][0] - np.array([word["bbox"][0] for word in words])) ** 2 - + (annot["bbox"][1] - np.array([word["bbox"][1] for word in words])) ** 2, - ) - distance_from_bbox_end = np.sqrt( - (annot["bbox"][2] - np.array([word["bbox"][2] for word in words])) ** 2 - + (annot["bbox"][3] - np.array([word["bbox"][3] for word in words])) ** 2, - ) - closest_start = try_argmin(distance_from_bbox_start) - closest_end = try_argmin(distance_from_bbox_end) - - # NOTE(klaijan) - get the word from closest start only if the end index comes after start index - text = "" - if closest_end >= closest_start: - for _ in range(closest_start, closest_end + 1): - text += " " - text += words[_]["text"] - else: - text = words[closest_start]["text"] + page_elements.append(element) + translation_mapping.append((layout_element, element)) + coordinates = ( + element.metadata.coordinates.points if element.metadata.coordinates else None + ) - annot["text"] = text.strip() - annot["start_index"] = words[closest_start]["start_index"] - return annot + el_image_path = ( + layout_element.image_path if hasattr(layout_element, "image_path") else None + ) + add_element_metadata( + element, + page_number=page_number, + filetype=image_format, + coordinates=coordinates, + coordinate_system=coordinate_system, + category_depth=element.metadata.category_depth, + image_path=el_image_path, + detection_origin=detection_origin, + languages=languages, + **kwargs, + ) -def try_argmin(array: np.ndarray) -> int: - """ - Attempt to find the index of the minimum value in a NumPy array. + for layout_element, element in translation_mapping: + if hasattr(layout_element, "parent") and layout_element.parent is not None: + element_parent = first( + (el for l_el, el in translation_mapping if l_el is layout_element.parent), + ) + element.metadata.parent_id = element_parent.id + sorted_page_elements = page_elements + if sortable and sort_mode != SORT_MODE_DONT: + sorted_page_elements = sort_page_elements(page_elements, sort_mode) - Args: - array (np.ndarray): The NumPy array in which to find the minimum value's index. + if include_page_breaks and page_number < num_pages + starting_page_number: + sorted_page_elements.append(PageBreak(text="")) + elements.extend(sorted_page_elements) - Returns: - int: The index of the minimum value in the array. If the array is empty or an - IndexError occurs, it returns -1. 
- """ - try: - return int(np.argmin(array)) - except IndexError: - return -1 + return elements diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 3863f50874..91a3e689f2 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -1,8 +1,15 @@ -from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, BinaryIO, List, Optional, Union, cast import numpy as np +from pdfminer.layout import LTChar, LTTextBox +from pdfminer.pdftypes import PDFObjRef from pdfminer.utils import open_filename +from unstructured_inference.inference.elements import Rectangle +from unstructured.documents.coordinates import PixelSpace, PointSpace +from unstructured.documents.elements import CoordinatesMetadata from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters from unstructured.partition.pdf_image.pdfminer_utils import ( extract_image_objects, @@ -28,21 +35,21 @@ def process_file_with_pdfminer( filename: str = "", dpi: int = 200, -) -> List[List["TextRegion"]]: +) -> tuple[List[List["TextRegion"]], List[List]]: with open_filename(filename, "rb") as fp: fp = cast(BinaryIO, fp) - extracted_layout = process_data_with_pdfminer( + extracted_layout, layouts_links = process_data_with_pdfminer( file=fp, dpi=dpi, ) - return extracted_layout + return extracted_layout, layouts_links @requires_dependencies("unstructured_inference") def process_data_with_pdfminer( file: Optional[Union[bytes, BinaryIO]] = None, dpi: int = 200, -) -> List[List["TextRegion"]]: +) -> tuple[List[List["TextRegion"]], List[List]]: """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the pdf pages using pdf2image""" @@ -52,14 +59,40 @@ def process_data_with_pdfminer( ) layouts = [] + layouts_links = [] # Coefficient to rescale bounding box to be compatible with images coef = dpi / 72 - for page, page_layout in open_pdfminer_pages_generator(file): - height = page_layout.height + for page_number, (page, page_layout) in enumerate(open_pdfminer_pages_generator(file)): + width, height = page_layout.width, page_layout.height text_layout = [] image_layout = [] + annotation_list = [] + coordinate_system = PixelSpace( + width=width, + height=height, + ) + if page.annots: + annotation_list = get_uris(page.annots, height, coordinate_system, page_number) + + annotation_threshold = env_config.PDF_ANNOTATION_THRESHOLD + urls_metadata: list[dict[str, Any]] = [] + for obj in page_layout: + x1, y1, x2, y2 = rect_to_bbox(obj.bbox, height) + bbox = (x1, y1, x2, y2) + + if len(annotation_list) > 0 and isinstance(obj, LTTextBox): + annotations_within_element = check_annotations_within_element( + annotation_list, + bbox, + page_number, + annotation_threshold, + ) + _, words = get_words_from_obj(obj, height) + for annot in annotations_within_element: + urls_metadata.append(map_bbox_and_index(words, annot)) + if hasattr(obj, "get_text"): inner_text_objects = extract_text_objects(obj) for inner_obj in inner_text_objects: @@ -85,6 +118,15 @@ def process_data_with_pdfminer( ) if text_region.bbox is not None and text_region.bbox.area > 0: image_layout.append(text_region) + links = [ + { + "bbox": [x * coef for x in metadata["bbox"]], + "text": metadata["text"], + "url": metadata["uri"], + "start_index": metadata["start_index"], + } + for metadata in urls_metadata + ] 
         clean_text_layout = remove_duplicate_elements(
             text_layout, env_config.EMBEDDED_TEXT_SAME_REGION_THRESHOLD
@@ -101,8 +143,8 @@ def process_data_with_pdfminer(
         layout = sort_text_regions(layout)

         layouts.append(layout)
-
-    return layouts
+        layouts_links.append(links)
+    return layouts, layouts_links


 def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class):
@@ -306,3 +348,347 @@ def aggregate_embedded_text_by_block(
     text = " ".join([obj.text for i, obj in enumerate(pdf_objects) if (mask[i] and obj.text)])

     return text
+
+
+def get_links_in_element(page_links: list, region: Rectangle) -> list:
+
+    links_bboxes = [Rectangle(*link.get("bbox")) for link in page_links]
+    results = bboxes1_is_almost_subregion_of_bboxes2(links_bboxes, [region])
+    links = [
+        {
+            "text": page_links[idx].get("text"),
+            "url": page_links[idx].get("url"),
+            "start_index": page_links[idx].get("start_index"),
+        }
+        for idx, result in enumerate(results)
+        if any(result)
+    ]
+
+    return links
+
+
+def get_uris(
+    annots: PDFObjRef | list[PDFObjRef],
+    height: float,
+    coordinate_system: PixelSpace | PointSpace,
+    page_number: int,
+) -> list[dict[str, Any]]:
+    """
+    Extracts URI annotations from a single PDF object reference or a list of references on a
+    specific page. Whether annots is a single reference or a list depends on the PDF formatting;
+    the function detects which form it receives and passes the annotations on to
+    get_uris_from_annots as a list.
+
+    Args:
+        annots (PDFObjRef | list[PDFObjRef]): A single or a list of PDF object references
+            representing annotations on the page.
+        height (float): The height of the page in the specified coordinate system.
+        coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent
+            the annotations' coordinates.
+        page_number (int): The page number from which to extract annotations.
+
+    Returns:
+        list[dict]: A list of dictionaries, each containing information about a URI annotation,
+            including its coordinates, bounding box, type, URI link, and page number.
+    """
+    if isinstance(annots, list):
+        return get_uris_from_annots(annots, height, coordinate_system, page_number)
+    resolved_annots = annots.resolve()
+    if resolved_annots is None:
+        return []
+    return get_uris_from_annots(resolved_annots, height, coordinate_system, page_number)
+
+
+def get_uris_from_annots(
+    annots: list[PDFObjRef],
+    height: int | float,
+    coordinate_system: PixelSpace | PointSpace,
+    page_number: int,
+) -> list[dict[str, Any]]:
+    """
+    Extracts URI annotations from a list of PDF object references.
+
+    Args:
+        annots (list[PDFObjRef]): A list of PDF object references representing annotations on
+            a page.
+        height (int | float): The height of the page in the specified coordinate system.
+        coordinate_system (PixelSpace | PointSpace): The coordinate system used to represent
+            the annotations' coordinates.
+        page_number (int): The page number from which to extract annotations.
+
+    Returns:
+        list[dict]: A list of dictionaries, each containing information about a URI annotation,
+            including its coordinates, bounding box, type, URI link, and page number.
+ """ + annotation_list = [] + for annotation in annots: + # Check annotation is valid for extraction + annotation_dict = try_resolve(annotation) + if not isinstance(annotation_dict, dict): + continue + subtype = annotation_dict.get("Subtype", None) + if not subtype or isinstance(subtype, PDFObjRef) or str(subtype) != "/'Link'": + continue + # Extract bounding box and update coordinates + rect = annotation_dict.get("Rect", None) + if not rect or isinstance(rect, PDFObjRef) or len(rect) != 4: + continue + x1, y1, x2, y2 = rect_to_bbox(rect, height) + points = ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) + coordinates_metadata = CoordinatesMetadata( + points=points, + system=coordinate_system, + ) + # Extract type + if "A" not in annotation_dict: + continue + uri_dict = try_resolve(annotation_dict["A"]) + if not isinstance(uri_dict, dict): + continue + uri_type = None + if "S" in uri_dict and not isinstance(uri_dict["S"], PDFObjRef): + uri_type = str(uri_dict["S"]) + # Extract URI link + uri = None + try: + if uri_type == "/'URI'": + uri = try_resolve(try_resolve(uri_dict["URI"])).decode("utf-8") + if uri_type == "/'GoTo'": + uri = try_resolve(try_resolve(uri_dict["D"])).decode("utf-8") + except Exception: + pass + + annotation_list.append( + { + "coordinates": coordinates_metadata, + "bbox": (x1, y1, x2, y2), + "type": uri_type, + "uri": uri, + "page_number": page_number, + }, + ) + return annotation_list + + +def try_resolve(annot: PDFObjRef): + """ + Attempt to resolve a PDF object reference. If successful, returns the resolved object; + otherwise, returns the original reference. + """ + try: + return annot.resolve() + except Exception: + return annot + + +def check_annotations_within_element( + annotation_list: list[dict[str, Any]], + element_bbox: tuple[float, float, float, float], + page_number: int, + annotation_threshold: float, +) -> list[dict[str, Any]]: + """ + Filter annotations that are within or highly overlap with a specified element on a page. + + Args: + annotation_list (list[dict[str,Any]]): A list of dictionaries, each containing information + about an annotation. + element_bbox (tuple[float, float, float, float]): The bounding box coordinates of the + specified element in the bbox format (x1, y1, x2, y2). + page_number (int): The page number to which the annotations and element belong. + annotation_threshold (float, optional): The threshold value (between 0.0 and 1.0) + that determines the minimum overlap required for an annotation to be considered + within the element. Default is 0.9. + + Returns: + list[dict[str,Any]]: A list of dictionaries containing information about annotations + that are within or highly overlap with the specified element on the given page, based on + the specified threshold. + """ + annotations_within_element = [] + for annotation in annotation_list: + if annotation["page_number"] == page_number: + annotation_bbox_size = calculate_bbox_area(annotation["bbox"]) + if annotation_bbox_size and ( + calculate_intersection_area(element_bbox, annotation["bbox"]) / annotation_bbox_size + > annotation_threshold + ): + annotations_within_element.append(annotation) + return annotations_within_element + + +def get_words_from_obj( + obj: LTTextBox, + height: float, +) -> tuple[list[LTChar], list[dict[str, Any]]]: + """ + Extracts characters and word bounding boxes from a PDF text element. + + Args: + obj (LTTextBox): The PDF text element from which to extract characters and words. + height (float): The height of the page in the specified coordinate system. 
+
+    Returns:
+        tuple[list[LTChar], list[dict[str,Any]]]: A tuple containing two lists:
+            - list[LTChar]: A list of LTChar objects representing individual characters.
+            - list[dict[str,Any]]]: A list of dictionaries, each containing information about
+                a word, including its text, bounding box, and start index in the element's text.
+    """
+    characters = []
+    words = []
+    text_len = 0
+
+    for text_line in obj:
+        word = ""
+        x1, y1, x2, y2 = None, None, None, None
+        start_index = 0
+        for index, character in enumerate(text_line):
+            if isinstance(character, LTChar):
+                characters.append(character)
+                char = character.get_text()
+
+                if word and not char.strip():
+                    words.append(
+                        {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
+                    )
+                    word = ""
+                    continue
+
+                # TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9
+                # will need to switch to some pattern matching once we support more languages
+                if not word:
+                    isalnum = char.isalnum()
+                if word and char.isalnum() != isalnum:
+                    isalnum = char.isalnum()
+                    words.append(
+                        {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
+                    )
+                    word = ""
+
+                if len(word) == 0:
+                    start_index = text_len + index
+                    x1 = character.x0
+                    y2 = height - character.y0
+                    x2 = character.x1
+                    y1 = height - character.y1
+                else:
+                    x2 = character.x1
+                    y2 = height - character.y0
+
+                word += char
+        else:
+            words.append(
+                {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
+            )
+            word = ""
+        text_len += len(text_line)
+    return characters, words
+
+
+def map_bbox_and_index(words: list[dict[str, Any]], annot: dict[str, Any]):
+    """
+    Maps a bounding box annotation to the corresponding text and start index within a list of words.
+
+    Args:
+        words (list[dict[str,Any]]): A list of dictionaries, each containing information about
+            a word, including its text, bounding box, and start index.
+        annot (dict[str,Any]): The annotation dictionary to be mapped, which will be updated with
+            "text" and "start_index" fields.
+
+    Returns:
+        dict: The updated annotation dictionary with "text" representing the mapped text and
+            "start_index" representing the start index of the mapped text in the list of words.
+    """
+    if len(words) == 0:
+        annot["text"] = ""
+        annot["start_index"] = -1
+        return annot
+    distance_from_bbox_start = np.sqrt(
+        (annot["bbox"][0] - np.array([word["bbox"][0] for word in words])) ** 2
+        + (annot["bbox"][1] - np.array([word["bbox"][1] for word in words])) ** 2,
+    )
+    distance_from_bbox_end = np.sqrt(
+        (annot["bbox"][2] - np.array([word["bbox"][2] for word in words])) ** 2
+        + (annot["bbox"][3] - np.array([word["bbox"][3] for word in words])) ** 2,
+    )
+    closest_start = try_argmin(distance_from_bbox_start)
+    closest_end = try_argmin(distance_from_bbox_end)
+
+    # NOTE(klaijan) - get the word from closest start only if the end index comes after start index
+    text = ""
+    if closest_end >= closest_start:
+        for _ in range(closest_start, closest_end + 1):
+            text += " "
+            text += words[_]["text"]
+    else:
+        text = words[closest_start]["text"]
+
+    annot["text"] = text.strip()
+    annot["start_index"] = words[closest_start]["start_index"]
+    return annot
+
+
+def calculate_intersection_area(
+    bbox1: tuple[float, float, float, float],
+    bbox2: tuple[float, float, float, float],
+) -> float:
+    """
+    Calculate the area of intersection between two bounding boxes.
+
+    Args:
+        bbox1 (tuple[float, float, float, float]): The coordinates of the first bounding box
+            in the format (x1, y1, x2, y2).
+        bbox2 (tuple[float, float, float, float]): The coordinates of the second bounding box
+            in the format (x1, y1, x2, y2).
+
+    Returns:
+        float: The area of intersection between the two bounding boxes. If there is no
+            intersection, the function returns 0.0.
+    """
+    x1_1, y1_1, x2_1, y2_1 = bbox1
+    x1_2, y1_2, x2_2, y2_2 = bbox2
+
+    x_intersection = max(x1_1, x1_2)
+    y_intersection = max(y1_1, y1_2)
+    x2_intersection = min(x2_1, x2_2)
+    y2_intersection = min(y2_1, y2_2)
+
+    if x_intersection < x2_intersection and y_intersection < y2_intersection:
+        intersection_area = calculate_bbox_area(
+            (x_intersection, y_intersection, x2_intersection, y2_intersection),
+        )
+        return intersection_area
+    else:
+        return 0.0
+
+
+def calculate_bbox_area(bbox: tuple[float, float, float, float]) -> float:
+    """
+    Calculate the area of a bounding box.
+
+    Args:
+        bbox (tuple[float, float, float, float]): The coordinates of the bounding box
+            in the format (x1, y1, x2, y2).
+
+    Returns:
+        float: The area of the bounding box, computed as the product of its width and height.
+    """
+    x1, y1, x2, y2 = bbox
+    area = (x2 - x1) * (y2 - y1)
+    return area
+
+
+def try_argmin(array: np.ndarray) -> int:
+    """
+    Attempt to find the index of the minimum value in a NumPy array.
+
+    Args:
+        array (np.ndarray): The NumPy array in which to find the minimum value's index.
+
+    Returns:
+        int: The index of the minimum value in the array. If the array is empty or an
+            IndexError occurs, it returns -1.
+    """
+    try:
+        return int(np.argmin(array))
+    except IndexError:
+        return -1
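Two pieces of the flow above are easy to sanity-check in isolation. First, `check_annotations_within_element` attributes a link annotation to a text element when the intersection area divided by the annotation's own area exceeds the threshold (`PDF_ANNOTATION_THRESHOLD`, 0.9 by default). A self-contained sketch with made-up boxes; the helpers here simply mirror `calculate_bbox_area` and `calculate_intersection_area` from the patch:

```python
def bbox_area(bbox: tuple[float, float, float, float]) -> float:
    x1, y1, x2, y2 = bbox
    return (x2 - x1) * (y2 - y1)


def intersect_area(b1, b2) -> float:
    # Overlap rectangle, or zero area if the boxes are disjoint.
    x1, y1 = max(b1[0], b2[0]), max(b1[1], b2[1])
    x2, y2 = min(b1[2], b2[2]), min(b1[3], b2[3])
    return bbox_area((x1, y1, x2, y2)) if x1 < x2 and y1 < y2 else 0.0


element_bbox = (0.0, 0.0, 100.0, 100.0)  # a text element on the page (made-up)
annot_bbox = (10.0, 10.0, 30.0, 20.0)  # a link annotation fully inside it (made-up)

ratio = intersect_area(element_bbox, annot_bbox) / bbox_area(annot_bbox)
print(ratio > 0.9)  # True -> the link is attributed to this element
```

Second, `map_bbox_and_index` recovers the link's text by picking the word whose (x1, y1) corner is nearest the annotation's (x1, y1), the word whose (x2, y2) corner is nearest the annotation's (x2, y2), and joining that span. A condensed equivalent of its distance computation on made-up word boxes:

```python
import numpy as np

words = [
    {"text": "Visit", "bbox": (10.0, 5.0, 40.0, 15.0), "start_index": 0},
    {"text": "our", "bbox": (45.0, 5.0, 65.0, 15.0), "start_index": 6},
    {"text": "site", "bbox": (70.0, 5.0, 95.0, 15.0), "start_index": 10},
]
annot_bbox = (44.0, 5.0, 96.0, 15.0)  # the link rectangle covers "our site"

starts = np.array([w["bbox"][:2] for w in words])
ends = np.array([w["bbox"][2:] for w in words])
closest_start = int(np.argmin(np.linalg.norm(starts - np.array(annot_bbox[:2]), axis=1)))
closest_end = int(np.argmin(np.linalg.norm(ends - np.array(annot_bbox[2:]), axis=1)))

text = " ".join(w["text"] for w in words[closest_start : closest_end + 1])
print(text, words[closest_start]["start_index"])  # "our site" 6
```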