Skip to content

Commit

Permalink
first pass, wip
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronsteers committed Feb 17, 2024
1 parent 182635a commit a6f00fd
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 0 deletions.
6 changes: 6 additions & 0 deletions airbyte/datasets/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

from pandas import DataFrame

from airbyte.documents import Document


class DatasetBase(ABC):
"""Base implementation for all datasets."""
Expand All @@ -25,3 +27,7 @@ def to_pandas(self) -> DataFrame:
# expects an iterator of dict objects. This cast is safe because we know
# duck typing is correct for this use case.
return DataFrame(cast(Iterator[dict[str, Any]], self))

def to_documents(self) -> Iterator[Document]:
    """Return this dataset's records converted to documents.

    The records obtained by iterating over this dataset are handed to
    `Document.from_records`, and whatever that call returns is returned as-is.
    """
    records = iter(self)
    return Document.from_records(records)
91 changes: 91 additions & 0 deletions airbyte/documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""Methods for converting Airbyte records into documents.
This module is modeled after the LangChain project's `Document` class:
- https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py
"""
from __future__ import annotations

import hashlib
import json
import textwrap
from typing import TYPE_CHECKING, Any

from pydantic import BaseModel


# Length threshold (in characters) that `Document.from_record` compares each string value
# against when choosing between single-line and wrapped multi-line rendering.
MAX_SINGLE_LINE_LENGTH = 60

if TYPE_CHECKING:
import datetime
from collections.abc import Iterable


class Document(BaseModel):
    """A PyAirbyte document is a specific projection on top of a record.

    Documents have the following structure:
    - id (str): A unique string identifier for the document.
    - content (str): A string representing the record when rendered as a document.
    - metadata (dict[str, Any]): Associated metadata about the document, such as the record's IDs
      and/or URLs.
    - last_modified (datetime.datetime | None): When the record was last modified, if known.

    This class is modeled after the LangChain project's `Document` class.

    TODO:
    - Decide if we need to rename 'content' to 'page_content' in order to match LangChain name.
    """

    id: str
    content: str
    metadata: dict[str, Any]
    # NOTE(review): pydantic resolves field annotations at runtime, but this module imports
    # `datetime` only under `TYPE_CHECKING` - confirm the model builds, or import it at runtime.
    last_modified: datetime.datetime | None = None

    @classmethod
    def from_records(cls, records: Iterable[dict[str, Any]]) -> Iterable[Document]:
        """Create an iterable of documents from an iterable of records.

        Documents are produced lazily, one per input record and in input order. No
        de-duplication is performed.
        """
        # A generator (rather than a set) keeps ordering and laziness, and does not require
        # the model instances to be hashable.
        for record in records:
            yield cls.from_record(record)

    @classmethod
    def from_record(cls, record: dict[str, Any]) -> Document:
        """Create a document from a record.

        String-valued fields are rendered into `content` as `key: value` lines; every other
        field is copied into `metadata`. Field order follows the order of the record.

        Args:
            record: A mapping of field names to values.

        Returns:
            The document projected from the record.

        TODO:
        - Parse 'id' from primary key records, if available. Otherwise hash the record data.
        - Parse 'last_modified' from the record, when available.
        - Add a convention to let the source define how 'content' should be rendered. In
          that case, the default rendering behavior below would become the fallback.
        - Add smarter logic for deciding which fields are metadata and which are content. In this
          first version, we assume that all string fields are content and all other fields are
          metadata - which doesn't work well for URLs, IDs, and many other field types.
        """
        primary_keys: list[str] = []  # TODO: Get the actual primary keys here.
        last_modified_key = None  # TODO: Get the actual last modified key here, when available.

        document_fields: list[str] = [
            property_name for property_name, value in record.items() if isinstance(value, str)
        ]
        metadata = {
            property_name: value
            for property_name, value in record.items()
            if not isinstance(value, str)
        }

        if primary_keys:
            doc_id = "-".join(str(record[key]) for key in primary_keys)
        else:
            # A dict is not hashable and the builtin `hash()` is salted per process, so derive
            # the fallback ID from a digest of the canonical JSON form instead.
            # `default=str` is deliberate: values that JSON cannot encode (e.g. datetimes) only
            # need a stable textual form for hashing.
            serialized = json.dumps(record, sort_keys=True, default=str)
            doc_id = hashlib.sha256(serialized.encode("utf-8")).hexdigest()

        # Short content is rendered as a single line, while long content is rendered as an
        # indented multi-line string with a 100 character width.
        rendered_fields: list[str] = []
        for key in document_fields:
            value = record[key]
            if len(value) <= MAX_SINGLE_LINE_LENGTH:
                rendered_fields.append(f"{key}: {value}")
                continue

            wrapped = textwrap.fill(
                value,
                width=100,
                initial_indent=" " * 4,
                subsequent_indent=" " * 4,
                break_long_words=False,
            )
            rendered_fields.append(f"{key}: \n{wrapped}")

        return cls(
            id=doc_id,
            content="\n".join(rendered_fields),
            metadata=metadata,
            last_modified=record[last_modified_key] if last_modified_key else None,
        )
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ ignore = [
# These we don't agree with or don't want to prioritize to enforce:
"ANN003", # kwargs missing type annotations
"ANN101", # Type annotations for 'self' args
"ANN102", # Type annotations for 'cls' args
"COM812", # Because it conflicts with ruff auto-format
"EM", # flake8-errmsgs (may reconsider later)
"DJ", # Django linting
Expand Down

0 comments on commit a6f00fd

Please sign in to comment.