From da93da0d4b8f35c98b812f1b2c9051b63b9a98c7 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:41:21 -0400 Subject: [PATCH 1/4] remove Office365-REST-Python-Client constraint pin --- requirements/deps/constraints.txt | 3 --- requirements/extra-pdf-image.txt | 2 +- requirements/extra-pptx.txt | 2 +- requirements/ingest/embed-aws-bedrock.txt | 2 +- requirements/ingest/embed-huggingface.txt | 2 +- requirements/ingest/embed-openai.txt | 2 +- requirements/ingest/embed-vertexai.txt | 2 +- requirements/ingest/embed-voyageai.txt | 2 +- requirements/ingest/onedrive.txt | 10 ++++++---- requirements/ingest/outlook.txt | 10 ++++++---- requirements/ingest/sharepoint.txt | 10 ++++++---- requirements/ingest/singlestore.txt | 2 +- 12 files changed, 26 insertions(+), 23 deletions(-) diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 7da1129d1d..4d7d1c5ba7 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -15,9 +15,6 @@ certifi>=2023.7.22 pyparsing<3.1.0 scipy<1.11.4 IPython<8.13 -# NOTE(alan) Pinned to avoid error that occurs with 2.4.3: -# AttributeError: 'ResourcePath' object has no attribute 'collection' -Office365-REST-Python-Client<2.4.3 # NOTE(trevor) `unstructured-inference` is set in extra-pdf-image.in to allow # unstructured-inference to be upgraded when unstructured library is upgraded # https://github.com/Unstructured-IO/unstructured/issues/1458 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index c57f23bd7b..d63001e7fe 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -149,7 +149,7 @@ pdfminer-six==20231228 # via # -r ./extra-pdf-image.in # pdfplumber -pdfplumber==0.11.2 +pdfplumber==0.11.3 # via layoutparser pikepdf==9.1.0 # via -r ./extra-pdf-image.in diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 6b8c80a834..42a06959e7 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -8,7 +8,7 @@ lxml==5.2.2 # via python-pptx pillow==10.4.0 # via python-pptx -python-pptx==1.0.1 +python-pptx==1.0.2 # via -r ./extra-pptx.in typing-extensions==4.12.2 # via python-pptx diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index 2c97a997d1..3f3fe1bd05 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -63,7 +63,7 @@ langchain-community==0.2.11 # via # -c ./ingest/../deps/constraints.txt # -r ./ingest/embed-aws-bedrock.in -langchain-core==0.2.28 +langchain-core==0.2.29 # via # langchain # langchain-community diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index 4c447f6ef4..24a97bd2ec 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -45,7 +45,7 @@ jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 # via jsonpatch -langchain-core==0.2.28 +langchain-core==0.2.29 # via langchain-huggingface langchain-huggingface==0.0.3 # via -r ./ingest/embed-huggingface.in diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 91beb70f6d..4a556148c6 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -53,7 +53,7 @@ jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 # via jsonpatch -langchain-core==0.2.28 +langchain-core==0.2.29 # via langchain-openai langchain-openai==0.1.20 # via -r ./ingest/embed-openai.in diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt index 035d0beb29..6574e8183f 100644 --- a/requirements/ingest/embed-vertexai.txt +++ b/requirements/ingest/embed-vertexai.txt @@ -110,7 +110,7 @@ langchain-community==0.2.11 # via # -c ./ingest/../deps/constraints.txt # -r ./ingest/embed-vertexai.in -langchain-core==0.2.28 +langchain-core==0.2.29 # via # langchain # langchain-community diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt index 6db8f82e0c..e4d8ecab99 100644 --- a/requirements/ingest/embed-voyageai.txt +++ b/requirements/ingest/embed-voyageai.txt @@ -46,7 +46,7 @@ jsonpointer==3.0.0 # via jsonpatch langchain==0.2.12 # via -r ./ingest/embed-voyageai.in -langchain-core==0.2.28 +langchain-core==0.2.29 # via # langchain # langchain-text-splitters diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt index 0fd9b13df1..b0358255f1 100644 --- a/requirements/ingest/onedrive.txt +++ b/requirements/ingest/onedrive.txt @@ -33,10 +33,8 @@ msal==1.30.0 # via # -r ./ingest/onedrive.in # office365-rest-python-client -office365-rest-python-client==2.4.2 - # via - # -c ./ingest/../deps/constraints.txt - # -r ./ingest/onedrive.in +office365-rest-python-client==2.5.11 + # via -r ./ingest/onedrive.in pycparser==2.22 # via cffi pyjwt[crypto]==2.9.0 @@ -52,6 +50,10 @@ soupsieve==2.5 # via # -c ./ingest/../base.txt # beautifulsoup4 +typing-extensions==4.12.2 + # via + # -c ./ingest/../base.txt + # office365-rest-python-client urllib3==1.26.19 # via # -c ./ingest/../base.txt diff --git a/requirements/ingest/outlook.txt b/requirements/ingest/outlook.txt index dace922e85..473e026252 100644 --- a/requirements/ingest/outlook.txt +++ b/requirements/ingest/outlook.txt @@ -27,10 +27,8 @@ msal==1.30.0 # via # -r ./ingest/outlook.in # office365-rest-python-client -office365-rest-python-client==2.4.2 - # via - # -c ./ingest/../deps/constraints.txt - # -r ./ingest/outlook.in +office365-rest-python-client==2.5.11 + # via -r ./ingest/outlook.in pycparser==2.22 # via cffi pyjwt[crypto]==2.9.0 @@ -42,6 +40,10 @@ requests==2.32.3 # -c ./ingest/../base.txt # msal # office365-rest-python-client +typing-extensions==4.12.2 + # via + # -c ./ingest/../base.txt + # office365-rest-python-client urllib3==1.26.19 # via # -c ./ingest/../base.txt diff --git a/requirements/ingest/sharepoint.txt b/requirements/ingest/sharepoint.txt index 8680b35cbe..3c84d0de9a 100644 --- a/requirements/ingest/sharepoint.txt +++ b/requirements/ingest/sharepoint.txt @@ -27,10 +27,8 @@ msal==1.30.0 # via # -r ./ingest/sharepoint.in # office365-rest-python-client -office365-rest-python-client==2.4.2 - # via - # -c ./ingest/../deps/constraints.txt - # -r ./ingest/sharepoint.in +office365-rest-python-client==2.5.11 + # via -r ./ingest/sharepoint.in pycparser==2.22 # via cffi pyjwt[crypto]==2.9.0 @@ -42,6 +40,10 @@ requests==2.32.3 # -c ./ingest/../base.txt # msal # office365-rest-python-client +typing-extensions==4.12.2 + # via + # -c ./ingest/../base.txt + # office365-rest-python-client urllib3==1.26.19 # via # -c ./ingest/../base.txt diff --git a/requirements/ingest/singlestore.txt b/requirements/ingest/singlestore.txt index b3ea506061..2a6d094a6a 100644 --- a/requirements/ingest/singlestore.txt +++ b/requirements/ingest/singlestore.txt @@ -40,7 +40,7 @@ requests==2.32.3 # via # -c ./ingest/../base.txt # singlestoredb -singlestoredb==1.6.1 +singlestoredb==1.6.2 # via -r ./ingest/singlestore.in sqlparams==6.0.1 # via singlestoredb From a62d1878703c36dbe1a941b93fcfc416186fcbdc Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:51:26 -0400 Subject: [PATCH 2/4] bump changelog and version --- CHANGELOG.md | 2 +- unstructured/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 658f4d23f4..3c63314d1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.2-dev3 +## 0.15.2-dev4 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 24b428a1eb..fac8b4352a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.2-dev3" # pragma: no cover +__version__ = "0.15.2-dev4" # pragma: no cover From 3e926731178833232212a10b691ef1f9dd790ec9 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Thu, 8 Aug 2024 14:42:25 -0400 Subject: [PATCH 3/4] type checking --- unstructured/ingest/v2/logger.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/unstructured/ingest/v2/logger.py b/unstructured/ingest/v2/logger.py index 34c5c1df3d..c565449b16 100644 --- a/unstructured/ingest/v2/logger.py +++ b/unstructured/ingest/v2/logger.py @@ -22,8 +22,8 @@ def default_is_data_sensitive(k: str, v: Any) -> bool: def hide_sensitive_fields( - data: dict, is_sensitive_fn: Callable[[str, Any], bool] = default_is_data_sensitive -) -> dict: + data: dict[str, Any], is_sensitive_fn: Callable[[str, Any], bool] = default_is_data_sensitive +) -> dict[str, Any]: """ Will recursively look through every k, v pair in this dict and any nested ones and run is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if @@ -62,8 +62,7 @@ def redact_jsons(s: str) -> str: if "{" not in chars: return s i = 0 - jsons = [] - i = 0 + jsons: list[str] = [] while i < len(chars): char = chars[i] if char == "{": From fe3c5fff9f56784f672e5a5d6dbac68d1f40a4e3 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Fri, 9 Aug 2024 12:26:29 -0400 Subject: [PATCH 4/4] fix datetime parse error --- unstructured/ingest/v2/processes/connectors/onedrive.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unstructured/ingest/v2/processes/connectors/onedrive.py b/unstructured/ingest/v2/processes/connectors/onedrive.py index c873ecfddf..3ab711b6bd 100644 --- a/unstructured/ingest/v2/processes/connectors/onedrive.py +++ b/unstructured/ingest/v2/processes/connectors/onedrive.py @@ -89,10 +89,10 @@ class OnedriveIndexer(Indexer): def list_objects(self, folder, recursive) -> list["DriveItem"]: drive_items = folder.children.get().execute_query() - files = [d for d in drive_items if d.is_file] + files = [d for d in drive_items if d.file is not None] if not recursive: return files - folders = [d for d in drive_items if d.is_folder] + folders = [d for d in drive_items if d.is_folder is not None] for f in folders: files.extend(self.list_objects(f, recursive)) return files @@ -123,12 +123,12 @@ def drive_item_to_file_data(self, drive_item: "DriveItem") -> FileData: server_path = file_path + "/" + filename rel_path = server_path.replace(self.index_config.path, "").lstrip("/") date_modified_dt = ( - parser.parse(drive_item.last_modified_datetime) + parser.parse(str(drive_item.last_modified_datetime)) if drive_item.last_modified_datetime else None ) date_created_at = ( - parser.parse(drive_item.created_datetime) if drive_item.created_datetime else None + parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None ) return FileData( identifier=drive_item.id,