first pass, wip

airbytehq · Feb 17, 2024 · a6f00fd · a6f00fd
1 parent 182635a
commit a6f00fd
Show file tree

Hide file tree

Showing 3 changed files with 98 additions and 0 deletions.
diff --git a/airbyte/datasets/_base.py b/airbyte/datasets/_base.py
@@ -7,6 +7,8 @@
 
 from pandas import DataFrame
 
+from airbyte.documents import Document
+
 
 class DatasetBase(ABC):
     """Base implementation for all datasets."""
@@ -25,3 +27,7 @@ def to_pandas(self) -> DataFrame:
         # expects an iterator of dict objects. This cast is safe because we know
         # duck typing is correct for this use case.
         return DataFrame(cast(Iterator[dict[str, Any]], self))
+
+    def to_documents(self) -> Iterator[Document]:
+        """Return the iterator of documents."""
+        return Document.from_records(self.__iter__())
diff --git a/airbyte/documents.py b/airbyte/documents.py
@@ -0,0 +1,91 @@
+"""Methods for converting Airbyte records into documents.
+
+This module is modeled after the LangChain project's `Document` class:
+- https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+import textwrap
+from typing import TYPE_CHECKING, Any
+
+from pydantic import BaseModel
+
+
+MAX_SINGLE_LINE_LENGTH = 60
+
+if TYPE_CHECKING:
+    import datetime
+    from collections.abc import Iterable
+
+
+class Document(BaseModel):
+    """A PyAirbyte document is a specific projection on top of a record.
+
+    Documents have the following structure:
+    - id (str): A unique string identifier for the document.
+    - content (str): A string representing the record when rendered as a document.
+    - metadata (dict[str, Any]): Associated metadata about the document, such as the record's IDs
+      and/or URLs.
+
+    This class is modeled after the LangChain project's `Document` class.
+
+    TODO:
+    - Decide if we need to rename 'content' to 'page_content' in order to match LangChain name.
+    """
+
+    id: str
+    content: str
+    metadata: dict[str, Any]
+    last_modified: datetime.datetime | None = None
+
+    @classmethod
+    def from_records(cls, records: Iterable[dict[str, Any]]) -> Iterable[Document]:
+        """Create an iterable of documents from an iterable of records."""
+        yield from {cls.from_record(record) for record in records}
+
+    @classmethod
+    def from_record(cls, record: dict[str, Any]) -> Document:
+        """Create a document from a record.
+
+        TODO:
+        - Parse 'id' from primary key records, if available. Otherwise hash the record data.
+        - Parse 'last_modified' from the record, when available.
+        - Add a convention to let the source define how 'content' should be rendered. In
+          that case, the default rendering behavior below would become the fallback.
+        - Add smarter logic for deciding which fields are metadata and which are content. In this
+          first version, we assume that all string fields are content and all other fields are
+          metadata - which doesn't work well for URLs, IDs, and many other field types.
+        """
+        primary_keys: list[str] = []  # TODO: Get the actual primary keys here.
+        document_fields: list[str] = [
+            property_name for property_name, value in record.values() if isinstance(value, str)
+        ]
+        metadata_fields = set(record.keys()) - set(document_fields)
+        doc_id: str = (
+            "-".join(str(record[key]) for key in primary_keys)
+            if primary_keys
+            else str(hash(record))
+        )
+        last_modified_key = None  # TODO: Get the actual last modified key here, when available.
+
+        # Short content is rendered as a single line, while long content is rendered as a indented
+        # multi-line string with a 100 character width.
+        content = "\n".join(
+            f"{key}: {value}"
+            if len(value) > MAX_SINGLE_LINE_LENGTH
+            else f"{key}: \n{textwrap.wrap(
+                value,
+                width=100,
+                initial_indent=' ' * 4,
+                subsequent_indent=' ' * 4,
+                break_long_words=False,
+            )}"
+            for key, value in record.items()
+            if key in document_fields
+        )
+        return cls(
+            id=doc_id,
+            content=content,
+            metadata={key: record[key] for key in metadata_fields},
+            last_modified=record[last_modified_key] if last_modified_key else None,
+        )
diff --git a/pyproject.toml b/pyproject.toml
@@ -136,6 +136,7 @@ ignore = [
     # These we don't agree with or don't want to prioritize to enforce:
     "ANN003",  # kwargs missing type annotations
     "ANN101",  # Type annotations for 'self' args
+    "ANN102",  # Type annotations for 'cls' args
     "COM812", # Because it conflicts with ruff auto-format
     "EM", # flake8-errmsgs (may reconsider later)
     "DJ", # Django linting