From 09229486cc9cc08353c208031561b011a21a4d75 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 16 Jul 2024 11:59:14 -0700 Subject: [PATCH] rfctr(file): extract file-type domain model Extract file-type descriptor data items into a separate `.model` module. --- unstructured/file_utils/filetype.py | 233 ++------------------------- unstructured/file_utils/model.py | 239 ++++++++++++++++++++++++++++ unstructured/partition/auto.py | 2 +- 3 files changed, 255 insertions(+), 219 deletions(-) create mode 100644 unstructured/file_utils/model.py diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index c092f62464..ca7758d1e0 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -1,6 +1,5 @@ from __future__ import annotations -import enum import functools import importlib.util import json @@ -13,6 +12,13 @@ from unstructured.documents.elements import Element from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str +from unstructured.file_utils.model import ( + EXT_TO_FILETYPE, + FILETYPE_TO_MIMETYPE, + PLAIN_TEXT_EXTENSIONS, + STR_TO_FILETYPE, + FileType, +) from unstructured.logger import logger from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN from unstructured.partition.common import ( @@ -25,219 +31,6 @@ LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic")) -TXT_MIME_TYPES = [ - "text/plain", - "message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822 -] - -# NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension. -# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by -# looking for expected filenames within the zip file. 
-EXPECTED_DOCX_FILES = [ - "docProps/core.xml", - "word/document.xml", -] - -EXPECTED_XLSX_FILES = [ - "xl/workbook.xml", -] - -EXPECTED_PPTX_FILES = [ - "docProps/core.xml", - "ppt/presentation.xml", -] - - -class FileType(enum.Enum): - UNK = 0 - EMPTY = 1 - - # MS Office Types - DOC = 10 - DOCX = 11 - XLS = 12 - XLSX = 13 - PPT = 14 - PPTX = 15 - MSG = 16 - - # Adobe Types - PDF = 20 - - # Image Types - JPG = 30 - PNG = 31 - TIFF = 32 - BMP = 33 - HEIC = 34 - - # Plain Text Types - EML = 40 - RTF = 41 - TXT = 42 - JSON = 43 - CSV = 44 - TSV = 45 - - # Markup Types - HTML = 50 - XML = 51 - MD = 52 - EPUB = 53 - RST = 54 - ORG = 55 - - # Compressed Types - ZIP = 60 - - # Open Office Types - ODT = 70 - - # Audio Files - WAV = 80 - - def __lt__(self, other: FileType) -> bool: - """Makes `FileType` members comparable with relational operators, at least with `<`. - - This makes them sortable, in particular it supports sorting for pandas groupby functions. - """ - return self.name < other.name - - -STR_TO_FILETYPE = { - "application/pdf": FileType.PDF, - "application/msword": FileType.DOC, - "application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX, - "image/jpeg": FileType.JPG, - "image/png": FileType.PNG, - "image/heic": FileType.HEIC, - "image/tiff": FileType.TIFF, - "image/bmp": FileType.BMP, - # NOTE(robinson) - https://mimetype.io/application/yaml - # In the future, we may have special processing for YAML - # files instead of treating them as plaintext - "application/yaml": FileType.TXT, - "application/x-yaml": FileType.TXT, - "text/x-yaml": FileType.TXT, - "text/yaml": FileType.TXT, - "text/plain": FileType.TXT, - "text/x-csv": FileType.CSV, - "application/csv": FileType.CSV, - "application/x-csv": FileType.CSV, - "text/comma-separated-values": FileType.CSV, - "text/x-comma-separated-values": FileType.CSV, - "text/csv": FileType.CSV, - "text/tsv": FileType.TSV, - "text/markdown": FileType.MD, - "text/x-markdown": FileType.MD, - 
"text/org": FileType.ORG, - "text/x-rst": FileType.RST, - "application/epub": FileType.EPUB, - "application/epub+zip": FileType.EPUB, - "application/json": FileType.JSON, - "application/rtf": FileType.RTF, - "text/rtf": FileType.RTF, - "text/html": FileType.HTML, - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX, - "application/vnd.ms-excel": FileType.XLS, - "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX, - "application/vnd.ms-powerpoint": FileType.PPT, - "application/xml": FileType.XML, - "application/vnd.oasis.opendocument.text": FileType.ODT, - "message/rfc822": FileType.EML, - "application/x-ole-storage": FileType.MSG, - "application/vnd.ms-outlook": FileType.MSG, - # NOTE(robinson) - https://mimetype.io/audio/wav - "audio/vnd.wav": FileType.WAV, - "audio/vnd.wave": FileType.WAV, - "audio/wave": FileType.WAV, - "audio/x-pn-wav": FileType.WAV, - "audio/x-wav": FileType.WAV, - "inode/x-empty": FileType.EMPTY, -} - -MIMETYPES_TO_EXCLUDE = [ - "text/x-markdown", - "application/epub+zip", - "text/x-csv", - "application/csv", - "application/x-csv", - "text/comma-separated-values", - "text/x-comma-separated-values", -] - -FILETYPE_TO_MIMETYPE = {v: k for k, v in STR_TO_FILETYPE.items() if k not in MIMETYPES_TO_EXCLUDE} - -EXT_TO_FILETYPE = { - ".pdf": FileType.PDF, - ".docx": FileType.DOCX, - ".jpg": FileType.JPG, - ".jpeg": FileType.JPG, - ".txt": FileType.TXT, - ".text": FileType.TXT, - ".log": FileType.TXT, - ".eml": FileType.EML, - ".xml": FileType.XML, - ".heic": FileType.HEIC, - ".htm": FileType.HTML, - ".html": FileType.HTML, - ".md": FileType.MD, - ".org": FileType.ORG, - ".rst": FileType.RST, - ".xlsx": FileType.XLSX, - ".pptx": FileType.PPTX, - ".p7s": FileType.EML, - ".png": FileType.PNG, - ".doc": FileType.DOC, - ".zip": FileType.ZIP, - ".xls": FileType.XLS, - ".ppt": FileType.PPT, - ".rtf": FileType.RTF, - ".json": FileType.JSON, - ".epub": FileType.EPUB, - ".msg": 
FileType.MSG, - ".odt": FileType.ODT, - ".csv": FileType.CSV, - ".tsv": FileType.TSV, - ".tab": FileType.TSV, - ".tiff": FileType.TIFF, - ".bmp": FileType.BMP, - ".wav": FileType.WAV, - # NOTE(robinson) - for now we are treating code files as plain text - ".js": FileType.TXT, - ".py": FileType.TXT, - ".java": FileType.TXT, - ".cpp": FileType.TXT, - ".cc": FileType.TXT, - ".cxx": FileType.TXT, - ".c": FileType.TXT, - ".cs": FileType.TXT, - ".php": FileType.TXT, - ".rb": FileType.TXT, - ".swift": FileType.TXT, - ".ts": FileType.TXT, - ".go": FileType.TXT, - ".yaml": FileType.TXT, - ".yml": FileType.TXT, - None: FileType.UNK, -} - -PLAIN_TEXT_EXTENSIONS = [ - ".txt", - ".text", - ".eml", - ".p7s", - ".md", - ".rtf", - ".html", - ".rst", - ".org", - ".csv", - ".tsv", - ".tab", - ".json", -] - def detect_filetype( filename: Optional[str] = None, @@ -317,7 +110,8 @@ def detect_filetype( else: return FileType.XML - elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"): + # -- ref: https://www.rfc-editor.org/rfc/rfc822 -- + elif mime_type == "message/rfc822" or mime_type.startswith("text"): if not encoding: encoding = "utf-8" formatted_encoding = format_encoding_str(encoding) @@ -450,12 +244,15 @@ def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType: file.seek(0) archive = zipfile.ZipFile(file) + # NOTE(robinson) - .docx/.xlsx files are actually zip files with a .docx/.xlsx extension. + # If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by + # looking for expected filenames within the zip file. 
archive_filenames = [f.filename for f in archive.filelist] - if all(f in archive_filenames for f in EXPECTED_DOCX_FILES): + if all(f in archive_filenames for f in ("docProps/core.xml", "word/document.xml")): return FileType.DOCX - elif all(f in archive_filenames for f in EXPECTED_XLSX_FILES): + elif all(f in archive_filenames for f in ("xl/workbook.xml",)): return FileType.XLSX - elif all(f in archive_filenames for f in EXPECTED_PPTX_FILES): + elif all(f in archive_filenames for f in ("docProps/core.xml", "ppt/presentation.xml")): return FileType.PPTX if LIBMAGIC_AVAILABLE: diff --git a/unstructured/file_utils/model.py b/unstructured/file_utils/model.py new file mode 100644 index 0000000000..068ee5c5f1 --- /dev/null +++ b/unstructured/file_utils/model.py @@ -0,0 +1,239 @@ +"""Domain-model for file-types.""" + +from __future__ import annotations + +import enum + + +class FileType(enum.Enum): + UNK = 0 + EMPTY = 1 + + # MS Office Types + DOC = 10 + DOCX = 11 + XLS = 12 + XLSX = 13 + PPT = 14 + PPTX = 15 + MSG = 16 + + # Adobe Types + PDF = 20 + + # Image Types + JPG = 30 + PNG = 31 + TIFF = 32 + BMP = 33 + HEIC = 34 + + # Plain Text Types + EML = 40 + RTF = 41 + TXT = 42 + JSON = 43 + CSV = 44 + TSV = 45 + + # Markup Types + HTML = 50 + XML = 51 + MD = 52 + EPUB = 53 + RST = 54 + ORG = 55 + + # Compressed Types + ZIP = 60 + + # Open Office Types + ODT = 70 + + # Audio Files + WAV = 80 + + def __lt__(self, other: FileType) -> bool: + """Makes `FileType` members comparable with relational operators, at least with `<`. + + This makes them sortable, in particular it supports sorting for pandas groupby functions. 
+ """ + return self.name < other.name + + +STR_TO_FILETYPE = { + # -- BMP -- + "image/bmp": FileType.BMP, + # -- CSV -- + "application/csv": FileType.CSV, + "application/x-csv": FileType.CSV, + "text/comma-separated-values": FileType.CSV, + "text/csv": FileType.CSV, + "text/x-comma-separated-values": FileType.CSV, + "text/x-csv": FileType.CSV, + # -- DOC -- + "application/msword": FileType.DOC, + # -- DOCX -- + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX, + # -- EML -- + "message/rfc822": FileType.EML, + # -- EPUB -- + "application/epub": FileType.EPUB, + "application/epub+zip": FileType.EPUB, + # -- HEIF -- + "image/heic": FileType.HEIC, + # -- HTML -- + "text/html": FileType.HTML, + # -- JPG -- + "image/jpeg": FileType.JPG, + # -- JSON -- + "application/json": FileType.JSON, + # -- MD -- + "text/markdown": FileType.MD, + "text/x-markdown": FileType.MD, + # -- MSG -- + "application/vnd.ms-outlook": FileType.MSG, + "application/x-ole-storage": FileType.MSG, + # -- ODT -- + "application/vnd.oasis.opendocument.text": FileType.ODT, + # -- ORG -- + "text/org": FileType.ORG, + # -- PDF -- + "application/pdf": FileType.PDF, + # -- PNG -- + "image/png": FileType.PNG, + # -- PPT -- + "application/vnd.ms-powerpoint": FileType.PPT, + # -- PPTX -- + "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX, + # -- RST -- + "text/x-rst": FileType.RST, + # -- RTF -- + "application/rtf": FileType.RTF, + "text/rtf": FileType.RTF, + # -- TIFF -- + "image/tiff": FileType.TIFF, + # -- TSV -- + "text/tsv": FileType.TSV, + # -- TXT -- + "text/plain": FileType.TXT, + # NOTE(robinson) - https://mimetype.io/application/yaml + # In the future, we may have special processing for YAML + # files instead of treating them as plaintext + "application/x-yaml": FileType.TXT, + "application/yaml": FileType.TXT, + "text/x-yaml": FileType.TXT, + "text/yaml": FileType.TXT, + # -- WAV -- + # NOTE(robinson) - 
https://mimetype.io/audio/wav + "audio/vnd.wav": FileType.WAV, + "audio/vnd.wave": FileType.WAV, + "audio/wave": FileType.WAV, + "audio/x-pn-wav": FileType.WAV, + "audio/x-wav": FileType.WAV, + # -- XLS -- + "application/vnd.ms-excel": FileType.XLS, + # -- XLSX -- + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX, + # -- XML -- + "application/xml": FileType.XML, + # -- EMPTY -- + "inode/x-empty": FileType.EMPTY, +} + +MIMETYPES_TO_EXCLUDE = [ + "application/csv", + "application/epub+zip", + "application/x-csv", + "text/comma-separated-values", + "text/x-comma-separated-values", + "text/x-csv", + "text/x-markdown", +] + +FILETYPE_TO_MIMETYPE = {v: k for k, v in STR_TO_FILETYPE.items() if k not in MIMETYPES_TO_EXCLUDE} + +EXT_TO_FILETYPE = { + # -- BMP -- + ".bmp": FileType.BMP, + # -- CSV -- + ".csv": FileType.CSV, + # -- DOC -- + ".doc": FileType.DOC, + # -- DOCX -- + ".docx": FileType.DOCX, + # -- EML -- + ".eml": FileType.EML, + ".p7s": FileType.EML, + # -- EPUB -- + ".epub": FileType.EPUB, + # -- HEIC -- + ".heic": FileType.HEIC, + # -- HTML -- + ".htm": FileType.HTML, + ".html": FileType.HTML, + # -- JPG -- + ".jpeg": FileType.JPG, + ".jpg": FileType.JPG, + # -- JSON -- + ".json": FileType.JSON, + # -- MD -- + ".md": FileType.MD, + # -- MSG -- + ".msg": FileType.MSG, + # -- ODT -- + ".odt": FileType.ODT, + # -- ORG -- + ".org": FileType.ORG, + # -- PDF -- + ".pdf": FileType.PDF, + # -- PNG -- + ".png": FileType.PNG, + # -- PPT -- + ".ppt": FileType.PPT, + # -- PPTX -- + ".pptx": FileType.PPTX, + # -- RST -- + ".rst": FileType.RST, + # -- RTF -- + ".rtf": FileType.RTF, + # -- TIFF -- + ".tiff": FileType.TIFF, + # -- TSV -- + ".tab": FileType.TSV, + ".tsv": FileType.TSV, + # -- TXT -- + ".text": FileType.TXT, + ".txt": FileType.TXT, + # NOTE(robinson) - for now we are treating code files as plain text + ".c": FileType.TXT, + ".cc": FileType.TXT, + ".cpp": FileType.TXT, + ".cs": FileType.TXT, + ".cxx": FileType.TXT, + ".go": 
FileType.TXT, + ".java": FileType.TXT, + ".js": FileType.TXT, + ".log": FileType.TXT, + ".php": FileType.TXT, + ".py": FileType.TXT, + ".rb": FileType.TXT, + ".swift": FileType.TXT, + ".ts": FileType.TXT, + ".yaml": FileType.TXT, + ".yml": FileType.TXT, + # -- WAV -- + ".wav": FileType.WAV, + # -- XLS -- + ".xls": FileType.XLS, + # -- XLSX -- + ".xlsx": FileType.XLSX, + # -- XML -- + ".xml": FileType.XML, + # -- ZIP -- + ".zip": FileType.ZIP, + # -- UNK -- + None: FileType.UNK, +} + +PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split() diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index d4ed7c7130..f4d7434e14 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -11,10 +11,10 @@ from unstructured.file_utils.filetype import ( FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, - FileType, detect_filetype, is_json_processable, ) +from unstructured.file_utils.model import FileType from unstructured.logger import logger from unstructured.partition.common import exactly_one from unstructured.partition.email import partition_email