Commit

Merge branch 'main' into ryan/ci-version-bump-only
ryannikolaidis authored Sep 27, 2023
2 parents 00d4b39 + 55315cf commit e7c2632
Showing 206 changed files with 7,287 additions and 2,788 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/ci.yml
@@ -232,7 +232,7 @@ jobs:
strategy:
matrix:
python-version: ["3.8","3.9","3.10","3.11"]
runs-on: ubuntu-latest
runs-on: ubuntu-latest-m
env:
NLTK_DATA: ${{ github.workspace }}/nltk_data
needs: [setup, lint]
@@ -290,6 +290,8 @@ jobs:
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
TABLE_OCR: "tesseract"
ENTIRE_PAGE_OCR: "tesseract"
run: |
2 changes: 2 additions & 0 deletions .github/workflows/ingest-test-fixtures-update-pr.yml
@@ -86,6 +86,8 @@ jobs:
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
TABLE_OCR: "tesseract"
ENTIRE_PAGE_OCR: "tesseract"
OVERWRITE_FIXTURES: "true"
74 changes: 51 additions & 23 deletions .github/workflows/release-version-alert.yml
@@ -2,49 +2,77 @@ name: Release Version Alert

on:
pull_request:
branches: [main]
types:
- opened
- synchronize
branches: [ main ]

jobs:
check-version:
check-version:
runs-on: ubuntu-latest

steps:
- name: Checkout codes
uses: actions/checkout@v4

- name: Check for __version__.py changes
id: version-changes
run: |
git fetch
CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.sha }})
echo "CHANGED_FILES=$CHANGED_FILES"
if echo "$CHANGED_FILES" | grep '__version__.py'; then
echo "Detected __version__.py changes. Proceeding..."
else
echo "No __version__.py changes detected. Skipping workflow..."
echo "SKIP_STEPS=true" >> $GITHUB_ENV # Set an environment variable to indicate skipping steps
fi
uses: actions/checkout@v4
- name: Get PR information
id: pr-info
run: |
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH")
HAS_PR=false; [ "$PR_NUMBER" != "null" ] && HAS_PR=true
echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV
echo "HAS_PR=$HAS_PR" >> $GITHUB_ENV
echo "PR_NUMBER=$PR_NUMBER"
echo "HAS_PR=$HAS_PR"
- name: Check versions
id: check-versions
if: env.SKIP_STEPS != 'true'
run: |
CHECK_NEW_VERSION_RESPONSE=$(bash scripts/check-new-release-version.sh)
if [[ "$CHECK_NEW_VERSION_RESPONSE" == "New release version"* ]]; then
echo "Sending Slack notification..."
MESSAGE="$CHECK_NEW_VERSION_RESPONSE :rocket: Coming soon in PR: https://github.com/$GITHUB_REPOSITORY/pull/$PR_NUMBER "
if [ "$HAS_PR" = true ]; then
MESSAGE="$CHECK_NEW_VERSION_RESPONSE :rocket: Coming soon in PR: https://github.com/$GITHUB_REPOSITORY/pull/$PR_NUMBER "
else
BRANCH_NAME=$(echo "${GITHUB_REF#refs/heads/}")
BRANCH_LINK="https://github.com/${{ github.repository }}/tree/$BRANCH_NAME"
MESSAGE="$CHECK_NEW_VERSION_RESPONSE :rocket: Coming soon in branch: $BRANCH_LINK"
fi
echo "SLACK_MESSAGE=$MESSAGE" >> $GITHUB_ENV
else
echo "No new non-dev version found. Skipping Slack notification."
echo "SKIP_STEPS=true" >> $GITHUB_ENV # Set an environment variable to indicate skipping steps
fi
echo "SLACK_MESSAGE=$MESSAGE"
- name: Generate Message Hash
if: env.SKIP_STEPS != 'true'
id: generate-hash
run: |
MESSAGE_HASH=$(echo "${{env.SLACK_MESSAGE}}" | sha256sum | cut -d ' ' -f1)
echo "MESSAGE_HASH=$MESSAGE_HASH" >> $GITHUB_ENV
- name: Restore Message from Cache
if: env.SKIP_STEPS != 'true'
id: restore-cache
uses: actions/cache/restore@v3
with:
path: message_cache.txt
key: message-cache-${{ env.MESSAGE_HASH }}
- name: Check for Duplicates
if: env.SKIP_STEPS != 'true'
run: |
DUPLICATE_CHECK=$(grep -Fx "${{env.SLACK_MESSAGE}}" message_cache.txt || true)
echo "DUPLICATE_CHECK=$DUPLICATE_CHECK"
if [ -n "$DUPLICATE_CHECK" ]; then
echo "Message already posted. Skipping duplicate Slack notification."
echo "SKIP_STEPS=true" >> $GITHUB_ENV # Set an environment variable to indicate skipping steps
fi
- name: Write Message to Cache File
if: env.SKIP_STEPS != 'true'
run: |
echo "${{env.SLACK_MESSAGE}}" >> message_cache.txt
cat message_cache.txt
- name: Store Message in Cache
if: env.SKIP_STEPS != 'true'
uses: actions/cache/save@v3
with:
path: message_cache.txt
key: message-cache-${{ env.MESSAGE_HASH }}
- name: Slack Notification
if: env.SKIP_STEPS != 'true'
uses: slackapi/[email protected]
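
The "Check versions" step above only branches on the output of `scripts/check-new-release-version.sh`, which is not part of this diff. A plausible sketch of such a script, inferred from how the workflow consumes its output, is below; the CHANGELOG-parsing and tag-comparison details are assumptions, not the committed implementation.

```bash
#!/usr/bin/env bash
# Hypothetical sketch of scripts/check-new-release-version.sh. The workflow
# only requires that a new, non-dev release prints a line starting with
# "New release version"; everything else here is an assumption.
set -euo pipefail

# Version at the top of CHANGELOG.md, e.g. "0.10.17-dev10" or "0.10.17"
changelog_version=$(grep -m1 -oE '[0-9]+\.[0-9]+\.[0-9]+(-dev[0-9]+)?' CHANGELOG.md)

# Most recent release tag, e.g. "0.10.16" (assumes tags track releases)
latest_tag=$(git describe --tags --abbrev=0 2>/dev/null || echo "0.0.0")

if [[ "$changelog_version" == *"-dev"* ]]; then
  echo "Top changelog version $changelog_version is a dev version."
elif [[ "$changelog_version" != "${latest_tag#v}" ]]; then
  echo "New release version $changelog_version detected."
else
  echo "No new version detected."
fi
```
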
19 changes: 18 additions & 1 deletion CHANGELOG.md
@@ -1,8 +1,15 @@
## 0.10.17-dev3
## 0.10.17-dev10

### Enhancements

* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
* **Add functionality to save embedded images in PDFs separately as images** Given a directory path, embedded images in PDFs are saved separately as image files, and the saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits."
* **Azure Cognitive Search destination connector** New Azure Cognitive Search destination connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data from over 20 data sources (so far) to an Azure Cognitive Search index.
* **Improves Salesforce partitioning** Partitions Salesforce data as xml instead of text for improved detail and flexibility. Partitions htmlbody instead of textbody for Salesforce emails. Importance: Allows all Salesforce fields to be ingested and gives Salesforce emails more detailed partitioning.
* **Add document level language detection functionality.** Introduces the "auto" default for the languages param, which then detects the languages present in the document using the `langdetect` package. Adds the document languages as ISO 639-3 codes to the element metadata. Implemented only for the partition_text function to start (see the usage sketch after this list).
* **PPTX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except that shapes enclosed in a group-shape are now included, as many levels deep as required (a group-shape can itself contain a group-shape).
* **Embeddings support for the SharePoint SourceConnector via unstructured-ingest CLI** The SharePoint connector can now optionally create embeddings from the elements it pulls out during partition and upload those embeddings to an Azure Cognitive Search index.
* **Improves hierarchy from docx files by leveraging natural hierarchies built into docx documents** Hierarchy can now be detected from an indentation level for list bullets/numbers and by style name (e.g. Heading 1, List Bullet 2, List Number).
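
The document-level language detection entry above can be exercised with a short sketch. This is illustrative only: the file name is hypothetical, and the parameter and metadata field names follow the changelog wording rather than a verified API reference.

```python
from unstructured.partition.text import partition_text

# languages=["auto"] (the behavior described above) asks the partitioner to
# detect the document's languages with `langdetect` and record them as
# ISO 639-3 codes on each element's metadata.
elements = partition_text(filename="example.txt", languages=["auto"])

for element in elements:
    print(element.metadata.languages, "-", element.text[:60])
```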

### Features

@@ -11,6 +18,14 @@
### Fixes

* **Fixes a metadata source serialization bug** Problem: In unstructured elements, when loading an elements json file from the disk, the data_source attribute is assumed to be an instance of DataSourceMetadata and the code acts based on that. However, the loader did not satisfy that assumption and loaded it as a dict instead, causing an error. Fix: Added the necessary code block to initialize a DataSourceMetadata object, and refactored the DataSourceMetadata.from_dict() method to remove redundant code. Importance: Crucial to be able to load elements (which have data_source fields) from json files. (A round-trip sketch follows this list.)
* **Fixes issue where unstructured-inference was not getting updated** Problem: unstructured-inference was not upgraded to the version matching the unstructured release when doing a pip install. Solution: `pip install unstructured[all-docs]` now upgrades both unstructured and unstructured-inference. Importance: This keeps the inference library in sync with the unstructured library; otherwise users end up on outdated libraries, which will likely lead to unintended behavior.
* **Fixes SharePoint connector failures if any document has an unsupported filetype** Problem: Currently the entire connector ingest run fails if a single IngestDoc has an unsupported filetype. This is because a ValueError is raised in the IngestDoc's `__post_init__`. Fix: Adds a try/catch when the IngestConnector runs get_ingest_docs such that the error is logged but all processable documents->IngestDocs are still instantiated and returned. Importance: Allows users to ingest SharePoint content even when some files with unsupported filetypes exist there.
* **Fixes Sharepoint connector server_path issue** Problem: Server path for the Sharepoint Ingest Doc was incorrectly formatted, causing issues while fetching pages from the remote source. Fix: changes formatting of remote file path before instantiating SharepointIngestDocs and appends a '/' while fetching pages from the remote source. Importance: Allows users to fetch pages from Sharepoint Sites.
* **Fixes badly initialized Formula** Problem: YoloX detects new element types; when loading a document that contains formulas, an element of the Formula class should be generated, but the Formula class inherited from Element instead of Text, so the document could not be loaded. Fix: Change the parent class of Formula to Text so the element is created with the correct class and the document loads. Importance: Crucial to be able to load documents that contain formulas.
* **Fixes Sphinx errors.** Fixes errors when running Sphinx `make html` and installs library to suppress warnings.
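
The metadata-serialization fix above is easiest to see as a JSON round trip. A minimal sketch, assuming the staging helpers `elements_to_json`/`elements_from_json` and the data-source fields named in these entries; the input file is hypothetical.

```python
from unstructured.partition.text import partition_text
from unstructured.staging.base import elements_from_json, elements_to_json

# Hypothetical round trip: write partitioned elements to disk, then reload them.
elements = partition_text(filename="example.txt")
elements_to_json(elements, filename="elements.json")

# Before the fix, data_source came back as a plain dict; after it, the loader
# rebuilds a DataSourceMetadata object, so attribute access works again.
for element in elements_from_json(filename="elements.json"):
    data_source = element.metadata.data_source
    if data_source is not None:
        print(data_source.date_created, data_source.record_locator)
```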


## 0.10.16

@@ -19,6 +34,7 @@
* **Adds data source properties to Airtable, Confluence, Discord, Elasticsearch, Google Drive, and Wikipedia connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
* **DOCX partitioner refactored in preparation for enhancement.** Behavior should be unchanged except in multi-section documents containing different headers/footers for different sections. These will now emit all distinct headers and footers encountered instead of just those for the last section.
* **Add a function to map between Tesseract and standard language codes.** This allows users to input language information to the `languages` param in any Tesseract-supported langcode or any ISO 639 standard language code.
* **Add document level language detection functionality.** Introduces the "auto" default for the languages param, which then detects the languages present in the document using the `langdetect` package. Implemented only for the partition_text function to start.

### Features

@@ -28,6 +44,7 @@

## 0.10.15


### Enhancements

* **Support for better element categories from the next-generation image-to-text model ("chipper").** Previously, not all of the classifications from Chipper were being mapped to proper `unstructured` element categories so the consumer of the library would see many `UncategorizedText` elements. This fixes the issue, improving the granularity of the element categories outputs for better downstream processing and chunking. The mapping update is:
9 changes: 1 addition & 8 deletions Makefile
@@ -221,14 +221,7 @@ install-paddleocr:
## pip-compile: compiles all base/dev/test requirements
.PHONY: pip-compile
pip-compile:
@for file in $(shell ls requirements/*.in); do \
if [[ "$${file}" =~ "constraints" ]]; then \
continue; \
fi; \
echo "running: pip-compile --upgrade $${file}"; \
pip-compile --upgrade $${file}; \
done
cp requirements/build.txt docs/requirements.txt
@scripts/pip-compile.sh



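
The Makefile now delegates to `scripts/pip-compile.sh`, which is not included in this diff. A sketch of what it presumably contains, assuming it simply carries over the loop removed from the recipe above:

```bash
#!/usr/bin/env bash
# Hypothetical reconstruction of scripts/pip-compile.sh, based on the recipe
# removed from the Makefile above; the committed script may differ.
set -euo pipefail

for file in requirements/*.in; do
  # constraints files are inputs to other compiles, not compiled themselves
  if [[ "${file}" =~ "constraints" ]]; then
    continue
  fi
  echo "running: pip-compile --upgrade ${file}"
  pip-compile --upgrade "${file}"
done

# keep the docs build requirements in sync with the compiled build requirements
cp requirements/build.txt docs/requirements.txt
```
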
8 changes: 6 additions & 2 deletions README.md
Expand Up @@ -68,14 +68,14 @@ NOTE: we build multi-platform images to support both x86_64 and Apple silicon ha
We build Docker images for all pushes to `main`. We tag each image with the corresponding short commit hash (e.g. `fbc7a69`) and the application version (e.g. `0.5.5-dev1`). We also tag the most recent image with `latest`. To leverage this, `docker pull` from our image repository.

```bash
docker pull quay.io/unstructured-io/unstructured:latest
docker pull downloads.unstructured.io/unstructured-io/unstructured:latest
```

Once pulled, you can create a container from this image and shell to it.

```bash
# create the container
docker run -dt --name unstructured quay.io/unstructured-io/unstructured:latest
docker run -dt --name unstructured downloads.unstructured.io/unstructured-io/unstructured:latest

# this will drop you into a bash shell where the Docker image is running
docker exec -it unstructured bash
@@ -253,3 +253,7 @@ Encountered a bug? Please create a new [GitHub issue](https://github.com/Unstruc
| [Company Website](https://unstructured.io) | Unstructured.io product and company info |
| [Documentation](https://unstructured-io.github.io/unstructured) | Full API documentation |
| [Batch Processing](unstructured/ingest/README.md) | Ingesting batches of documents through Unstructured |

## :chart_with_upwards_trend: Analytics

We’ve partnered with Scarf (https://scarf.sh) to collect anonymized user statistics to understand which features our community is using and how to prioritize product decision-making in the future. To learn more about how we collect and use this data, please read our [Privacy Policy](https://unstructured.io/privacy-policy).
20 changes: 18 additions & 2 deletions docs/requirements.txt
@@ -24,6 +24,7 @@ charset-normalizer==3.2.0
# requests
docutils==0.18.1
# via
# myst-parser
# sphinx
# sphinx-rtd-theme
# sphinx-tabs
@@ -38,9 +39,21 @@ imagesize==1.4.1
importlib-metadata==6.8.0
# via sphinx
jinja2==3.1.2
# via sphinx
# via
# myst-parser
# sphinx
markdown-it-py==3.0.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==2.1.3
# via jinja2
mdit-py-plugins==0.4.0
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-parser==2.0.0
# via -r requirements/build.in
packaging==23.1
# via
# -c requirements/base.txt
@@ -52,6 +65,8 @@ pygments==2.16.1
# sphinx-tabs
pytz==2023.3.post1
# via babel
pyyaml==6.0.1
# via myst-parser
requests==2.31.0
# via
# -c requirements/base.txt
@@ -66,6 +81,7 @@ sphinx==6.2.1
# via
# -r requirements/build.in
# furo
# myst-parser
# sphinx-basic-ng
# sphinx-rtd-theme
# sphinx-tabs
@@ -105,5 +121,5 @@ urllib3==1.26.16
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
zipp==3.16.2
zipp==3.17.0
# via importlib-metadata
6 changes: 3 additions & 3 deletions docs/source/api.rst
@@ -1,7 +1,7 @@
Unstructured API
#################

Try our hosted API! It's freely available to use with any of the file types listed above. This is the easiest way to get started, all you need is an API key. You can get your API key `here <https://unstructured.io/#get-api-key>`_ now and start using it today.
Try our hosted API! It's freely available to use with any of the file types listed above. This is the easiest way to get started, all you need is an API key. You can get your API key `here <https://unstructured.io/#get-api-key>`__ now and start using it today.

Now you can get started with this quick example:

@@ -45,9 +45,9 @@ Now you can get started with this quick example:
json_response = response.json()
Below, you will find a more comprehensive overview of the API capabilities. For detailed information on request and response schemas, refer to the `API documentation <https://api.unstructured.io/general/docs#/>`_.
Below, you will find a more comprehensive overview of the API capabilities. For detailed information on request and response schemas, refer to the `API documentation <https://api.unstructured.io/general/docs#/>`__.

NOTE: You can also host the API locally. For more information check the `Using the API Locally`_ section.
NOTE: You can also host the API locally. For more information check the `Using the API Locally <https://github.com/Unstructured-IO/unstructured-api>`__ section.


Supported File Types
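
The api.rst hunk above points at a quick example that ends with `json_response = response.json()`. For context, a self-contained version of such a call is sketched here; the endpoint URL, header name, and sample file are assumptions based on the hosted API's public documentation, not taken from this diff.

```python
import requests

# Hypothetical quick-start call to the hosted API; replace the key and file.
url = "https://api.unstructured.io/general/v0/general"
headers = {"accept": "application/json", "unstructured-api-key": "YOUR_API_KEY"}

with open("example.pdf", "rb") as f:
    response = requests.post(url, headers=headers, files={"files": f})

json_response = response.json()  # a list of element dicts, e.g. with "type" and "text"
print(json_response[0]["type"], json_response[0]["text"][:80])
```
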
4 changes: 2 additions & 2 deletions docs/source/bricks/cleaning.rst
@@ -357,7 +357,7 @@ Examples:
# Returns ""‘A lovely quote!’"
replace_unicode_characters("\x91A lovely quote!\x92")
For more information about the ``replace_unicode_quotes`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/cleaners/core.py>`_.
For more information about the ``replace_unicode_quotes`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/cleaners/core.py>`__.


``translate_text``
Expand Down Expand Up @@ -390,4 +390,4 @@ Examples:
# Output is "I can also translate Russian!"
translate_text("Я тоже можно переводать русский язык!", "ru", "en")
For more information about the ``translate_text`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/cleaners/translate.py>`_.
For more information about the ``translate_text`` brick, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/cleaners/translate.py>`__.