From aa5935b357e01285c11d47653f3cecbb21c1c94c Mon Sep 17 00:00:00 2001
From: Marianna
Date: Thu, 24 Oct 2024 15:02:34 +0200
Subject: [PATCH] Ml 384/whitespaces in cct (#3747)

This ticket ensures that the CCT metric is not sensitive to differences in
whitespace (including newlines). All whitespace in both the GT and PRED
strings is standardized to a single space `" "` before the metric is
computed.

Additional changes in CHANGELOG are due to auto-formatting.
---
 CHANGELOG.md                            |  69 ++++----
 .../metrics/test_text_extraction.py     | 163 +++++++++++++++++-
 unstructured/__version__.py             |   2 +-
 unstructured/metrics/text_extraction.py |  13 +-
 4 files changed, 203 insertions(+), 44 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f062d1004..a2cd18b1a2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+## 0.16.2-dev2
+
+### Enhancements
+
+### Features
+
+* **Whitespace-invariant CCT distance metric.** CCT Levenshtein distance between strings is now computed with standardized whitespace by default: all whitespace, including newlines, is collapsed to a single space before comparison.
+
 ## 0.16.2-dev1
 
 ### Enhancements
@@ -6,7 +14,7 @@
 
 ### Fixes
 
-## **Fixed retry config settings for partition_via_api function** If the SDK's default retry config is not set the retry config getter function does not fail anymore.
+* **Fixed retry config settings for partition_via_api function** If the SDK's default retry config is not set the retry config getter function does not fail anymore.
 
 ## 0.16.1
 
@@ -318,7 +326,6 @@
 ### Features
 
 * **Expose conversion functions for tables** Adds public functions to convert tables from HTML to the Deckerd format and back
-
 * **Adds Kafka Source and Destination** New source and destination connector added to all CLI ingest commands to support reading from and writing to Kafka streams. Also supports Confluent Kafka.
 
 ### Fixes
@@ -365,7 +372,7 @@
 * **Move logger error to debug level when PDFminer fails to extract text** which includes error message for Invalid dictionary construct.
 * **Add support for Pinecone serverless** Adds Pinecone serverless to the connector tests. Pinecone
-serverless will work with versions >=0.14.2, but hadn't been tested until now.
+  serverless will work with versions >=0.14.2, but hadn't been tested until now.
 
 ### Features
@@ -448,6 +455,7 @@
 * **Add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR** configuration parameters to control temporary storage.
 
 ### Features
+
 * **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the ground work for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.
 
 ### Fixes
@@ -625,8 +633,8 @@
 ### Enhancements
 
 ### Features
-* Add `date_from_file_object` parameter to partition. If True and if file is provided via `file` parameter it will cause partition to infer last modified date from `file`'s content. If False, last modified metadata will be `None`.
 
+* Add `date_from_file_object` parameter to partition. If True and if file is provided via `file` parameter it will cause partition to infer last modified date from `file`'s content. If False, last modified metadata will be `None`.
 * **Header and footer detection for fast strategy** `partition_pdf` with `fast` strategy now detects elements that are in the top or bottom 5 percent of the page as headers and footers.
 * **Add parent_element to overlapping case output** Adds parent_element to the output for `identify_overlapping_or_nesting_case` and `catch_overlapping_and_nested_bboxes` functions.
@@ -645,7 +653,6 @@
 * **Rename `OpenAiEmbeddingConfig` to `OpenAIEmbeddingConfig`.**
 * **Fix partition_json() doesn't chunk.** The `@add_chunking_strategy` decorator was missing from `partition_json()` such that pre-partitioned documents serialized to JSON did not chunk when a chunking-strategy was specified.
-
 ## 0.12.4
 
 ### Enhancements
@@ -674,7 +681,6 @@
 * **Add title to Vectara upload - was not separated out from initial connector **
 * **Fix change OpenSearch port to fix potential conflict with Elasticsearch in ingest test **
-
 ## 0.12.3
 
 ### Enhancements
@@ -727,6 +733,7 @@
 * **Install Kapa AI chatbot.** Added Kapa.ai website widget on the documentation.
 
 ### Features
+
 * **MongoDB Source Connector.** New source connector added to all CLI ingest commands to support downloading/partitioning files from MongoDB.
 * **Add OpenSearch source and destination connectors.** OpenSearch, a fork of Elasticsearch, is a popular storage solution for various functionality such as search, or providing intermediary caches within data pipelines. Feature: Added OpenSearch source connector to support downloading/partitioning files. Added OpenSearch destination connector to be able to ingest documents from any supported source, embed them and write the embeddings / documents into OpenSearch.
@@ -905,8 +912,8 @@
 * **Import tables_agent from inference** so that we don't have to initialize a global table agent in unstructured OCR again
 * **Fix empty table is identified as bulleted-table.** A table with no text content was mistakenly identified as a bulleted-table and processed by the wrong branch of the initial HTML partitioner.
 * **Fix partition_html() emits empty (no text) tables.** A table with cells nested below a `<thead>` or `<tfoot>` element was emitted as a table element having no text and unparseable HTML in `element.metadata.text_as_html`. Do not emit empty tables to the element stream.
-* **Fix HTML `element.metadata.text_as_html` contains spurious <br/> elements in invalid locations.** The HTML generated for the `text_as_html` metadata for HTML tables contained `<br/>` elements in invalid locations like between `<table>` and `<tr>`. Change the HTML generator such that these do not appear.
-* **Fix HTML table cells enclosed in <thead> and <tfoot> elements are dropped.** HTML table cells nested in a `<thead>` or `<tfoot>` element were not detected and the text in those cells was omitted from the table element text and `.text_as_html`. Detect table rows regardless of the semantic tag they may be nested in.
+* **Fix HTML `element.metadata.text_as_html` contains spurious `<br/>` elements in invalid locations.** The HTML generated for the `text_as_html` metadata for HTML tables contained `<br/>` elements in invalid locations like between `<table>` and `<tr>`. Change the HTML generator such that these do not appear.
+* **Fix HTML table cells enclosed in `<thead>` and `<tfoot>` elements are dropped.** HTML table cells nested in a `<thead>` or `<tfoot>` element were not detected and the text in those cells was omitted from the table element text and `.text_as_html`. Detect table rows regardless of the semantic tag they may be nested in.
 * **Remove whitespace padding from `.text_as_html`.** `tabulate` inserts padding spaces to achieve visual alignment of columns in HTML tables it generates. Add our own HTML generator to do this simple job and omit that padding as well as newlines ("\n") used for human readability.
 * **Fix local connector with absolute input path** When passed an absolute filepath for the input document path, the local connector incorrectly writes the output file to the input file directory. This fixes such that the output in this case is written to `output-dir/input-filename.json`
@@ -974,8 +981,8 @@
 * **Update `ocr_only` strategy in `partition_pdf()`** Adds the functionality to get accurate coordinate data when partitioning PDFs and Images with the `ocr_only` strategy.
 
 ### Fixes
-* **Fixed SharePoint permissions for the fetching to be opt-in** Problem: Sharepoint permissions were trying to be fetched even when no related cli params were provided, and this gave an error due to values for those keys not existing. Fix: Updated getting keys to be with .get() method and changed the "skip-check" to check individual cli params rather than checking the existence of a config object.
 
+* **Fixed SharePoint permissions for the fetching to be opt-in** Problem: Sharepoint permissions were trying to be fetched even when no related cli params were provided, and this gave an error due to values for those keys not existing. Fix: Updated getting keys to be with .get() method and changed the "skip-check" to check individual cli params rather than checking the existence of a config object.
 * **Fixes issue where tables from markdown documents were being treated as text** Problem: Tables from markdown documents were being treated as text, and not being extracted as tables. Solution: Enable the `tables` extension when instantiating the `python-markdown` object. Importance: This will allow users to extract structured data from tables in markdown documents.
 * **Fix wrong logger for paddle info** Replace the logger from unstructured-inference with the logger from unstructured for paddle_ocr.py module.
 * **Fix ingest pipeline to be able to use chunking and embedding together** Problem: When ingest pipeline was using chunking and embedding together, embedding outputs were empty and the outputs of chunking couldn't be re-read into memory and be forwarded to embeddings. Fix: Added CompositeElement type to TYPE_TO_TEXT_ELEMENT_MAP to be able to process CompositeElements with unstructured.staging.base.isd_to_elements
@@ -1028,7 +1035,7 @@
 ### Features
 
 * **Table OCR refactor** support Table OCR with pre-computed OCR data to ensure we only do one OCR for entire document. User can specify
-ocr agent tesseract/paddle in environment variable `OCR_AGENT` for OCRing the entire document.
+  ocr agent tesseract/paddle in environment variable `OCR_AGENT` for OCRing the entire document.
 * **Adds accuracy function** The accuracy scoring was originally an option under `calculate_edit_distance`. For easy function call, it is now a wrapper around the original function that calls edit_distance and return as "score".
 * **Adds HuggingFaceEmbeddingEncoder** The HuggingFace Embedding Encoder uses a local embedding model as opposed to using an API.
 * **Add AWS bedrock embedding connector** `unstructured.embed.bedrock` now provides a connector to use AWS bedrock's `titan-embed-text` model to generate embeddings for elements. This feature requires a valid AWS bedrock setup and an internet connection to run.
@@ -1059,7 +1066,7 @@ ocr agent tesseract/paddle in environment variable `OCR_AGENT` for OCRing the en
 ### Fixes
 
 * **Fix paddle model file not discoverable** Fixes issue where ocr_models/paddle_ocr.py file is not discoverable on PyPI by adding
-an `__init__.py` file under the folder.
+  an `__init__.py` file under the folder.
 * **Chipper v2 Fixes** Includes fix for a memory leak and rare last-element bbox fix. (unstructured-inference==0.7.7)
 * **Fix image resizing issue** Includes fix related to resizing images in the tables pipeline. (unstructured-inference==0.7.6)
@@ -1121,12 +1128,13 @@ an `__init__.py` file under the folder.
 * **Applies `max_characters=` argument to all element types in `add_chunking_strategy` decorator** Previously this argument was only utilized in chunking Table elements and now applies to all partitioned elements if `add_chunking_strategy` decorator is utilized, further preparing the elements for downstream processing.
 * **Add common retry strategy utilities for unstructured-ingest** Dynamic retry strategy with exponential backoff added to Notion source connector.
 *
+
 ### Features
 
 * **Adds `bag_of_words` and `percent_missing_text` functions** In order to count the word frequencies in two input texts and calculate the percentage of text missing relative to the source document.
 * **Adds `edit_distance` calculation metrics** In order to benchmark the cleaned, extracted text with unstructured, `edit_distance` (`Levenshtein distance`) is included.
 * **Adds detection_origin field to metadata** Problem: Currently isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information the developers and users are now able to know how an element was created to make decisions on how to use it. In order to use this feature
-setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed.
+  setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed.
 * **Adds a function that calculates frequency of the element type and its depth** To capture the accuracy of element type extraction, this function counts the occurrences of each unique element type with its depth for use in element metrics.
 
 ### Fixes
@@ -1136,11 +1144,10 @@ setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed.
 * **Fixes category_depth None value for Title elements** Problem: `Title` elements from `chipper` get `category_depth`= None even when `Headline` and/or `Subheadline` elements are present in the same page. Fix: all `Title` elements with `category_depth` = None should be set to have a depth of 0 instead iff there are `Headline` and/or `Subheadline` element-types present. Importance: `Title` elements should be equivalent html `H1` when nested headings are present; otherwise, `category_depth` metadata can result ambiguous within elements in a page.
 * **Tweak `xy-cut` ordering output to be more column friendly** This results in the order of elements more closely reflecting natural reading order which benefits downstream applications.
   While element ordering from `xy-cut` is usually mostly correct when ordering multi-column documents, sometimes elements from a RHS column will appear before elements in a LHS column. Fix: add swapped `xy-cut` ordering by sorting by X coordinate first and then Y coordinate.
 * **Fixes badly initialized Formula** Problem: YoloX contain new types of elements, when loading a document that contain formulas a new element of that class
-should be generated, however the Formula class inherits from Element instead of Text. After this change the element is correctly created with the correct class
-allowing the document to be loaded. Fix: Change parent class for Formula to Text. Importance: Crucial to be able to load documents that contain formulas.
+  should be generated, however the Formula class inherits from Element instead of Text. After this change the element is correctly created with the correct class
+  allowing the document to be loaded. Fix: Change parent class for Formula to Text. Importance: Crucial to be able to load documents that contain formulas.
 * **Fixes pdf uri error** An error was encountered when URI type of `GoToR` which refers to pdf resources outside of its own was detected since no condition catches such case. The code is fixing the issue by initialize URI before any condition check.
-
 ## 0.10.19
 
 ### Enhancements
@@ -1149,7 +1156,7 @@
 * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images.
 * **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item, address, or narrative text, categorize it as a title.
 * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself.
-* **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, users can now specify the `max_characters=` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length <n> characters. This means partitioned Table results are ready for use in downstream applications without any post processing.
+* **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, users can now specify the `max_characters=` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length `<n>` characters. This means partitioned Table results are ready for use in downstream applications without any post processing.
 * **Expose endpoint url for s3 connectors** By allowing for the endpoint url to be explicitly overwritten, this allows for any non-AWS data providers supporting the s3 protocol to be supported (i.e. minio).
 
 ### Features
@@ -1217,7 +1224,6 @@ allowing the document to be loaded. Fix: Change parent class for Formula to Text
 ## 0.10.15
-
 ### Enhancements
 
 * **Support for better element categories from the next-generation image-to-text model ("chipper").** Previously, not all of the classifications from Chipper were being mapped to proper `unstructured` element categories so the consumer of the library would see many `UncategorizedText` elements. This fixes the issue, improving the granularity of the element categories outputs for better downstream processing and chunking. The mapping update is:
@@ -1291,7 +1297,6 @@
 * Add Jira Connector to be able to pull issues from a Jira organization
 * Add `clean_ligatures` function to expand ligatures in text
-
 ### Fixes
 
 * `partition_html` breaks on `<br>` elements.
@@ -1309,14 +1314,12 @@
 * Support for yolox_quantized layout detection model (0.5.20)
 * YoloX element types added
-
 ### Features
 
 * Add Salesforce Connector to be able to pull Account, Case, Campaign, EmailMessage, Lead
 
 ### Fixes
-
 * Bump unstructured-inference
 * Avoid divide-by-zero errors with `safe_division` (0.5.21)
@@ -1437,15 +1440,18 @@
 * Adds ability to reuse connections per process in unstructured-ingest
 
 ### Features
+
 * Add delta table connector
 
 ### Fixes
 
 ## 0.10.4
+
 * Pass ocr_mode in partition_pdf and set the default back to individual pages for now
 * Add diagrams and descriptions for ingest design in the ingest README
 
 ### Features
+
 * Supports multipage TIFF image partitioning
 
 ### Fixes
@@ -1453,6 +1459,7 @@
 ## 0.10.2
 
 ### Enhancements
+
 * Bump unstructured-inference==0.5.13:
   - Fix extracted image elements being included in layout merge, addresses the issue where an entire-page image in a PDF was not passed to the layout model when using hi_res.
@@ -1464,6 +1471,7 @@
 ## 0.10.1
 
 ### Enhancements
+
 * Bump unstructured-inference==0.5.12:
   - fix to avoid trace for certain PDF's (0.5.12)
   - better defaults for DPI for hi_res and Chipper (0.5.11)
@@ -1515,7 +1523,6 @@
 ## 0.9.2
-
 ### Enhancements
 
 * Update table extraction section in API documentation to sync with change in Prod API
@@ -1562,7 +1569,7 @@
 * Skip ingest test on missing Slack token
 * Add Dropbox variables to CI environments
 * Remove default encoding for ingest
-* Adds new element type `EmailAddress` for recognising email address in the  text
+* Adds new element type `EmailAddress` for recognising email address in the text
 * Simplifies `min_partition` logic; makes partitions falling below the `min_partition` less likely.
 * Fix bug where ingest test check for number of files fails in smoke test
@@ -1694,7 +1701,6 @@
 * Adjust encoding recognition threshold value in `detect_file_encoding`
 * Fix KeyError when `isd_to_elements` doesn't find a type
 * Fix `_output_filename` for local connector, allowing single files to be written correctly to the disk
-
 * Fix for cases where an invalid encoding is extracted from an email header.
 
 ### BREAKING CHANGES
@@ -1706,6 +1712,7 @@
 ### Enhancements
 
 * Adds `include_metadata` kwarg to `partition_doc`, `partition_docx`, `partition_email`, `partition_epub`, `partition_json`, `partition_msg`, `partition_odt`, `partition_org`, `partition_pdf`, `partition_ppt`, `partition_pptx`, `partition_rst`, and `partition_rtf`
+
 ### Features
 
 * Add Elasticsearch connector for ingest cli to pull specific fields from all documents in an index.
@@ -1940,10 +1947,8 @@
 ### Features
 
-
 ### Fixes
 
-
 ## 0.6.10
 
 ### Enhancements
@@ -2040,7 +2045,6 @@ allowing the document to be loaded. Fix: Change parent class for Formula to Text
 ### Fixes
 
-
 ## 0.6.4
 
 ### Enhancements
@@ -2077,7 +2081,6 @@
 * Added logic to `partition_pdf` for detecting copy protected PDFs and falling back to the hi res strategy when necessary.
-
 ### Features
 
 * Add `partition_via_api` for partitioning documents through the hosted API.
@@ -2148,8 +2151,8 @@
 * Added method to utils to allow date time format validation
 
 ### Features
-* Add Slack connector to pull messages for a specific channel
 
+* Add Slack connector to pull messages for a specific channel
 * Add --partition-by-api parameter to unstructured-ingest
 * Added `partition_rtf` for processing rich text files.
 * `partition` now accepts a `url` kwarg in addition to `file` and `filename`.
@@ -2279,7 +2282,7 @@
 ### Features
 
 * Add `AzureBlobStorageConnector` based on its `fsspec` implementation inheriting
-from `FsspecConnector`
+  from `FsspecConnector`
 * Add `partition_epub` for partitioning e-books in EPUB3 format.
 
 ### Fixes
@@ -2312,16 +2315,16 @@
 * Fully move from printing to logging.
 * `unstructured-ingest` now uses a default `--download_dir` of `$HOME/.cache/unstructured/ingest`
-rather than a "tmp-ingest-" dir in the working directory.
+  rather than a "tmp-ingest-" dir in the working directory.
 
 ### Features
 
 ### Fixes
 
 * `setup_ubuntu.sh` no longer fails in some contexts by interpreting
-`DEBIAN_FRONTEND=noninteractive` as a command
+  `DEBIAN_FRONTEND=noninteractive` as a command
 * `unstructured-ingest` no longer re-downloads files when --preserve-downloads
-is used without --download-dir.
+  is used without --download-dir.
 * Fixed an issue that was causing text to be skipped in some HTML documents.
 
 ## 0.5.1
@@ -2498,7 +2501,7 @@
 * Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
 * Helper functions for identifying and extracting phone numbers
 * Add new function `extract_attachment_info` that extracts and decodes the attachment
-of an email.
+  of an email.
 * Staging brick to convert a list of `Element`s to a `pandas` dataframe.
 * Add plain text functionality to `partition_email`

diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index 4a152264ff..50db8bee17 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -106,20 +106,30 @@ def test_calculate_edit_distance():
 
 
 @pytest.mark.parametrize(
-    ("filename", "expected_score", "expected_distance"),
+    ("filename", "standardize_whitespaces", "expected_score", "expected_distance"),
     [
-        ("fake-text.txt", 0.78, 38),
+        ("fake-text.txt", False, 0.78, 38),
+        ("fake-text.txt", True, 0.92, 12),
     ],
 )
-def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance):
+def test_calculate_edit_distance_with_filename(
+    filename, standardize_whitespaces, expected_score, expected_distance
+):
     with open("example-docs/fake-text.txt") as f:
         source_cct = f.read()
 
     elements = partition(filename=f"example-docs/{filename}")
     output_cct = "\n".join([str(el) for el in elements])
 
-    score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score")
-    distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance")
+    score = text_extraction.calculate_edit_distance(
+        output_cct, source_cct, return_as="score", standardize_whitespaces=standardize_whitespaces
+    )
+    distance = text_extraction.calculate_edit_distance(
+        output_cct,
+        source_cct,
+        return_as="distance",
+        standardize_whitespaces=standardize_whitespaces,
+    )
 
     assert score >= 0
     assert score <= 1.0
@@ -128,6 +138,109 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
     assert distance == expected_distance
 
 
+@pytest.mark.parametrize(
+    ("text1", "text2"),
+    [
+        (
+            "The dog\rloved the cat, but\t\n the cat\tloved the\n cow",
+            "The dog loved the cat, but the cat loved the cow",
+        ),
+        (
+            "Hello my\tname\tis H a r p e r, \nwhat's your\vname?",
+            "Hello my name is H a r p e r, what's your name?",
+        ),
+        (
+            "I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
+            "I have a dog and a cat, I love my dog.",
+        ),
+        (
+            """
+            Name Age City Occupation
+            Alice 30 New York Engineer
+            Bob 25 Los Angeles Designer
+            Charlie 35 Chicago Teacher
+            David 40 San Francisco Developer
+            """,
+            """
+            Name\tAge\tCity\tOccupation
+            Alice\t30\tNew York\tEngineer
+            Bob\t25\tLos Angeles\tDesigner
+            Charlie\t35\tChicago\tTeacher
+            David\t40\tSan Francisco\tDeveloper
+            """,
+        ),
+        (
+            """
+            Name\tAge\tCity\tOccupation
+            Alice\t30\tNew York\tEngineer
+            Bob\t25\tLos Angeles\tDesigner
+            Charlie\t35\tChicago\tTeacher
+            David\t40\tSan Francisco\tDeveloper
+            """,
+            "Name\tAge\tCity\tOccupation\n\n \nAlice\t30\tNew York\tEngineer\nBob\t25\tLos Angeles\tDesigner\nCharlie\t35\tChicago\tTeacher\nDavid\t40\tSan Francisco\tDeveloper",  # noqa: E501
+        ),
+    ],
+)
+def test_calculate_edit_distance_with_various_whitespace_1(text1, text2):
+    assert (
+        text_extraction.calculate_edit_distance(
+            text1, text2, return_as="score", standardize_whitespaces=True
+        )
+        == 1.0
+    )
+    assert (
+        text_extraction.calculate_edit_distance(
+            text1, text2, return_as="distance", standardize_whitespaces=True
+        )
+        == 0
+    )
+    assert (
+        text_extraction.calculate_edit_distance(
+            text1, text2, return_as="score", standardize_whitespaces=False
+        )
+        < 1.0
+    )
+    assert (
+        text_extraction.calculate_edit_distance(
+            text1, text2, return_as="distance", standardize_whitespaces=False
+        )
+        > 0
+    )
+
+
+def test_calculate_edit_distance_with_various_whitespace_2():
source_cct_tabs = """ + Name\tAge\tCity\tOccupation + Alice\t30\tNew York\tEngineer + Bob\t25\tLos Angeles\tDesigner + Charlie\t35\tChicago\tTeacher + David\t40\tSan Francisco\tDeveloper + """ + source_cct_with_borders = """ + + | Name | Age | City | Occupation | + |---------|-----|--------------|----------------| + | Alice | 30 | New York | Engineer | + | Bob | 25 | Los Angeles | Designer | + | Charlie | 35 | Chicago | Teacher | + | David | 40 | San Francisco| Developer | + + """ + assert text_extraction.calculate_edit_distance( + source_cct_tabs, source_cct_with_borders, return_as="score", standardize_whitespaces=True + ) > text_extraction.calculate_edit_distance( + source_cct_tabs, source_cct_with_borders, return_as="score", standardize_whitespaces=False + ) + assert text_extraction.calculate_edit_distance( + source_cct_tabs, source_cct_with_borders, return_as="distance", standardize_whitespaces=True + ) < text_extraction.calculate_edit_distance( + source_cct_tabs, + source_cct_with_borders, + return_as="distance", + standardize_whitespaces=False, + ) + + @pytest.mark.parametrize( ("text", "expected"), [ @@ -187,6 +300,46 @@ def test_bag_of_words(text, expected): assert text_extraction.bag_of_words(text) == expected +@pytest.mark.parametrize( + ("text", "expected"), + [ + ( + "The dog\rloved the cat, but\t\n the cat\tloved the\n cow\n\n", + "The dog loved the cat, but the cat loved the cow", + ), + ( + "\n\nHello my\tname\tis H a r p e r, \nwhat's your\vname?", + "Hello my name is H a r p e r, what's your name?", + ), + ( + "I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.", + "I have a dog and a cat, I love my dog.", + ), + ( + """L is for the way you look at me + O is for the only one I see + V is very, very extraordinary + E is even more than anyone that you adore can""", + "L is for the way you look at me O is for the only one I see V is very, very extraordinary E is even more than anyone that you adore can", # noqa: E501 + ), + ( + """ + | Name | Age | City | Occupation | + |---------|-----|--------------|----------------| + | Alice | 30 | New York | Engineer | + | Bob | 25 | Los Angeles | Designer | + | Charlie | 35 | Chicago | Teacher | + | David | 40 | San Francisco| Developer | + """, + "| Name | Age | City | Occupation | |---------|-----|--------------|----------------| | Alice | 30 | New York | Engineer | | Bob | 25 | Los Angeles | Designer | | Charlie | 35 | Chicago | Teacher | | David | 40 | San Francisco| Developer |", # noqa: E501 + ), + ], +) +def test_prepare_string(text, expected): + assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected + assert text_extraction.prepare_str(text) == text + + @pytest.mark.parametrize( ("output_text", "source_text", "expected_percentage"), [ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ce553d47b1..ea305a9cdd 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.2-dev1" # pragma: no cover +__version__ = "0.16.2-dev2" # pragma: no cover diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py index 1b1af4ad38..dd2fa721b1 100644 --- a/unstructured/metrics/text_extraction.py +++ b/unstructured/metrics/text_extraction.py @@ -22,6 +22,7 @@ def calculate_edit_distance( source: Optional[str], weights: Tuple[int, int, int] = (2, 1, 1), return_as: str = "distance", + standardize_whitespaces: bool = True, ) -> float: """ Calculates edit distance using Levenshtein distance between two strings. 
@@ -56,8 +57,8 @@ def calculate_edit_distance( return_types = ["score", "distance"] if return_as not in return_types: raise ValueError("Invalid return value type. Expected one of: %s" % return_types) - output = _prepare_str(output) - source = _prepare_str(source) + output = prepare_str(output, standardize_whitespaces) + source = prepare_str(source, standardize_whitespaces) distance = Levenshtein.distance(output, source, weights=weights) # type: ignore # lower bounded the char length for source string at 1.0 because to avoid division by zero # in the case where source string is empty, the distance should be at 100% @@ -127,8 +128,8 @@ def calculate_percent_missing_text( Returns the percentage of missing text represented as a decimal between 0 and 1. """ - output = _prepare_str(output) - source = _prepare_str(source) + output = prepare_str(output) + source = prepare_str(source) output_bow = bag_of_words(output) source_bow = bag_of_words(source) @@ -153,7 +154,9 @@ def calculate_percent_missing_text( return min(fraction_missing, 1) # limit to 100% -def _prepare_str(string: Optional[str]) -> str: +def prepare_str(string: Optional[str], standardize_whitespaces: bool = False) -> str: if not string: return "" + if standardize_whitespaces: + return " ".join(string.split()) return str(string) # type: ignore
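
Note for reviewers: a minimal usage sketch of the behavior this patch introduces. It assumes the patched library (0.16.2-dev2) is installed; the example strings are made up for illustration and are not from the test suite.

    from unstructured.metrics import text_extraction

    gt = "The dog loved the cat,\nbut the cat\tloved the cow"
    pred = "The dog loved the cat, but the cat loved the cow"

    # With standardize_whitespaces=True (the new default for
    # calculate_edit_distance), runs of whitespace -- spaces, tabs,
    # newlines -- are collapsed to a single space in both strings before
    # the Levenshtein distance is computed, so these compare as equal.
    assert text_extraction.calculate_edit_distance(pred, gt, return_as="distance") == 0
    assert text_extraction.calculate_edit_distance(pred, gt, return_as="score") == 1.0

    # Opting out restores the old whitespace-sensitive behavior.
    assert text_extraction.calculate_edit_distance(
        pred, gt, return_as="distance", standardize_whitespaces=False
    ) > 0

    # The standardization itself is just prepare_str, i.e. " ".join(s.split()).
    assert text_extraction.prepare_str(gt, standardize_whitespaces=True) == pred

Note the asymmetry in defaults: `calculate_edit_distance` defaults to `standardize_whitespaces=True` (the new whitespace-invariant behavior), while `prepare_str` keeps `False` as its default, so `calculate_percent_missing_text`, which calls it without the flag, is unaffected by this change.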