From 1f6a452dfa327ba8df79551d31bbd5f1f065640d Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Tue, 16 Jul 2024 13:52:45 -0400 Subject: [PATCH 1/3] Save file id for all fsspec connectors if present --- unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py b/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py index 3a1a76ca9b..2adfa99b03 100644 --- a/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py +++ b/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py @@ -187,6 +187,9 @@ def get_metadata(self, path: str) -> DataSourceMetadata: "protocol": self.index_config.protocol, "remote_file_path": self.index_config.remote_url, } + file_stat = self.fs.stat(path=path) + if file_id := file_stat.get("id"): + record_locator["file_id"] = file_id if metadata: record_locator["metadata"] = metadata return DataSourceMetadata( From 3fcf26c5a846487fddfad41b7011bb885edf12a2 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Thu, 18 Jul 2024 13:28:19 -0400 Subject: [PATCH 2/3] bump changelog --- CHANGELOG.md | 2 +- unstructured/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94a6ff8bc1..92e3b10099 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.0-dev15 +## 0.15.0-dev16 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a9a2902f8e..11dfa89756 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.0-dev15" # pragma: no cover +__version__ = "0.15.0-dev16" # pragma: no cover From 83fada8589a2f21b6780bef9b1ed341272dbe98c Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Fri, 19 Jul 2024 05:16:38 -0700 Subject: [PATCH 3/3] feat: save file id for all fsspec connectors if present <- Ingest test fixtures update (#3419) This pull request includes updated ingest test fixtures. Please review and merge if appropriate. Co-authored-by: rbiseck3 --- .../box/handbook-1p.docx.json | 45 ++++++++++++------- .../box/nested-1/ideas-page.html.json | 3 +- .../nested-1/nested-2/ideas-page.html.json | 3 +- .../box/science-exploration-1p.pptx.json | 39 ++++++++++------ .../dropbox/handbook-1p.docx.json | 45 ++++++++++++------- .../dropbox/nested-1/ideas-page.html.json | 3 +- .../dropbox/nested-2/ideas-page.html.json | 3 +- .../dropbox/science-exploration-1p.pptx.json | 39 ++++++++++------ .../gcs/ideas-page.html.json | 3 +- .../gcs/nested-1/fake-text.txt.json | 18 +++++--- .../gcs/nested-1/nested/ideas-page.html.json | 3 +- .../gcs/nested-2/fake-text.txt.json | 18 +++++--- .../gcs/nested-2/nested/ideas-page.html.json | 3 +- .../gcs/nested-2/stanley-cups.xlsx.json | 12 +++-- 14 files changed, 158 insertions(+), 79 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json index 289d45c63f..39646d9a76 100644 --- a/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json +++ b/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json @@ -14,7 +14,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -41,7 +42,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -68,7 +70,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -89,7 +92,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -110,7 +114,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -131,7 +136,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -152,7 +158,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -173,7 +180,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -194,7 +202,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -215,7 +224,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -236,7 +246,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -257,7 +268,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -278,7 +290,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -299,7 +312,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" @@ -321,7 +335,8 @@ "version": "83125548004193369404829885052395764226", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255888824139" }, "date_created": "1688874451.0", "date_modified": "1688874451.0" diff --git a/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json index e6928373be..93b73c6c45 100644 --- a/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json @@ -14,7 +14,8 @@ "version": "77943175838335685751163845636763163681", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255892530552" }, "date_created": "1688874401.0", "date_modified": "1688874401.0" diff --git a/test_unstructured_ingest/expected-structured-output/box/nested-1/nested-2/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/box/nested-1/nested-2/ideas-page.html.json index 8184904b6e..108a93d61d 100644 --- a/test_unstructured_ingest/expected-structured-output/box/nested-1/nested-2/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/box/nested-1/nested-2/ideas-page.html.json @@ -14,7 +14,8 @@ "version": "293680985726204769765169474511274942733", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255884723846" }, "date_created": "1688874389.0", "date_modified": "1688874389.0" diff --git a/test_unstructured_ingest/expected-structured-output/box/science-exploration-1p.pptx.json b/test_unstructured_ingest/expected-structured-output/box/science-exploration-1p.pptx.json index 6534f21834..23a1ddae76 100644 --- a/test_unstructured_ingest/expected-structured-output/box/science-exploration-1p.pptx.json +++ b/test_unstructured_ingest/expected-structured-output/box/science-exploration-1p.pptx.json @@ -14,7 +14,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -36,7 +37,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -58,7 +60,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -80,7 +83,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -102,7 +106,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -124,7 +129,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -146,7 +152,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -168,7 +175,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -190,7 +198,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -212,7 +221,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -234,7 +244,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -256,7 +267,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" @@ -278,7 +290,8 @@ "version": "309546934335254463247992132065898582121", "record_locator": { "protocol": "box", - "remote_file_path": "box://utic-test-ingest-fixtures" + "remote_file_path": "box://utic-test-ingest-fixtures", + "file_id": "1255894255490" }, "date_created": "1688874428.0", "date_modified": "1688874428.0" diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json index 8607d1e336..94e1c93f42 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json @@ -14,7 +14,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -39,7 +40,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -64,7 +66,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -83,7 +86,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -102,7 +106,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -121,7 +126,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -140,7 +146,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -159,7 +166,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -178,7 +186,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -197,7 +206,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -216,7 +226,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -235,7 +246,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -254,7 +266,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -273,7 +286,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } @@ -293,7 +307,8 @@ "version": "134700592086487568162605251521926324397", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" } } } diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json index b6c29b1f92..62d85dcb85 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json @@ -14,7 +14,8 @@ "version": "67356979305728150851855820427694668063", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACw" } } } diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json index bc738ae7de..47b76c535d 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json @@ -14,7 +14,8 @@ "version": "145453788782335405288844961545898675998", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAADQ" } } } diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json b/test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json index 0c85135a8a..c5a44f1582 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json @@ -14,7 +14,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -34,7 +35,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -54,7 +56,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -74,7 +77,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -94,7 +98,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -114,7 +119,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -134,7 +140,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -154,7 +161,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -174,7 +182,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -194,7 +203,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -214,7 +224,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -234,7 +245,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } @@ -254,7 +266,8 @@ "version": "26035320120182381452247268381589958225", "record_locator": { "protocol": "dropbox", - "remote_file_path": "dropbox://test-input/" + "remote_file_path": "dropbox://test-input/", + "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" } } } diff --git a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json index cb37c514ef..9151850e0f 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json @@ -14,7 +14,8 @@ "version": "199523943725186047835150971481714294476", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/ideas-page.html/1687304971036821" }, "date_created": "1687304971.038", "date_modified": "1687304971.038" diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json index 552ecceb02..d49564e20f 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json @@ -13,7 +13,8 @@ "version": "180263070579038859328651626981788275889", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-1/fake-text.txt/1687304893301804" }, "date_created": "1687304893.303", "date_modified": "1687304893.303" @@ -34,7 +35,8 @@ "version": "180263070579038859328651626981788275889", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-1/fake-text.txt/1687304893301804" }, "date_created": "1687304893.303", "date_modified": "1687304893.303" @@ -55,7 +57,8 @@ "version": "180263070579038859328651626981788275889", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-1/fake-text.txt/1687304893301804" }, "date_created": "1687304893.303", "date_modified": "1687304893.303" @@ -76,7 +79,8 @@ "version": "180263070579038859328651626981788275889", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-1/fake-text.txt/1687304893301804" }, "date_created": "1687304893.303", "date_modified": "1687304893.303" @@ -97,7 +101,8 @@ "version": "180263070579038859328651626981788275889", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-1/fake-text.txt/1687304893301804" }, "date_created": "1687304893.303", "date_modified": "1687304893.303" @@ -118,7 +123,8 @@ "version": "180263070579038859328651626981788275889", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-1/fake-text.txt/1687304893301804" }, "date_created": "1687304893.303", "date_modified": "1687304893.303" diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json index ac7b27b970..98fa456dbd 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json @@ -14,7 +14,8 @@ "version": "310890354306462681752199911957569001015", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-1/nested/ideas-page.html/1687304893748677" }, "date_created": "1687304893.75", "date_modified": "1687304893.75" diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json index 215071df1e..7f5a3c007b 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json @@ -13,7 +13,8 @@ "version": "198731266903969902154134165613731741332", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/fake-text.txt/1687304904189941" }, "date_created": "1687304904.192", "date_modified": "1687304904.192" @@ -34,7 +35,8 @@ "version": "198731266903969902154134165613731741332", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/fake-text.txt/1687304904189941" }, "date_created": "1687304904.192", "date_modified": "1687304904.192" @@ -55,7 +57,8 @@ "version": "198731266903969902154134165613731741332", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/fake-text.txt/1687304904189941" }, "date_created": "1687304904.192", "date_modified": "1687304904.192" @@ -76,7 +79,8 @@ "version": "198731266903969902154134165613731741332", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/fake-text.txt/1687304904189941" }, "date_created": "1687304904.192", "date_modified": "1687304904.192" @@ -97,7 +101,8 @@ "version": "198731266903969902154134165613731741332", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/fake-text.txt/1687304904189941" }, "date_created": "1687304904.192", "date_modified": "1687304904.192" @@ -118,7 +123,8 @@ "version": "198731266903969902154134165613731741332", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/fake-text.txt/1687304904189941" }, "date_created": "1687304904.192", "date_modified": "1687304904.192" diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json index 6dc079de40..e7691d8897 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json @@ -14,7 +14,8 @@ "version": "113813498010717860141768546590661839404", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/nested/ideas-page.html/1687304904584421" }, "date_created": "1687304904.586", "date_modified": "1687304904.586" diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json index 792fbbe991..c7a6b9d3be 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json @@ -15,7 +15,8 @@ "version": "25646232132200560657189097157576319365", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx/1687304904970987" }, "date_created": "1687304904.973", "date_modified": "1687304904.973" @@ -39,7 +40,8 @@ "version": "25646232132200560657189097157576319365", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx/1687304904970987" }, "date_created": "1687304904.973", "date_modified": "1687304904.973" @@ -62,7 +64,8 @@ "version": "25646232132200560657189097157576319365", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx/1687304904970987" }, "date_created": "1687304904.973", "date_modified": "1687304904.973" @@ -86,7 +89,8 @@ "version": "25646232132200560657189097157576319365", "record_locator": { "protocol": "gs", - "remote_file_path": "gs://utic-test-ingest-fixtures/" + "remote_file_path": "gs://utic-test-ingest-fixtures/", + "file_id": "utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx/1687304904970987" }, "date_created": "1687304904.973", "date_modified": "1687304904.973"