Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Easy persistent cache with new ab.get_colab_cache helper function #361

Merged
merged 7 commits into from
Sep 15, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions airbyte/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@
from airbyte import (
caches,
cloud,
constants,
datasets,
destinations,
documents,
Expand All @@ -139,7 +140,7 @@
)
from airbyte.caches.bigquery import BigQueryCache
from airbyte.caches.duckdb import DuckDBCache
from airbyte.caches.util import get_default_cache, new_local_cache
from airbyte.caches.util import get_colab_cache, get_default_cache, new_local_cache
from airbyte.datasets import CachedDataset
from airbyte.destinations.base import Destination
from airbyte.destinations.util import get_destination
Expand All @@ -154,8 +155,9 @@

__all__ = [
# Modules
"cloud",
"caches",
"cloud",
"constants",
"datasets",
"destinations",
"documents",
Expand All @@ -169,6 +171,7 @@
"sources",
# Factories
"get_available_connectors",
"get_colab_cache",
"get_default_cache",
"get_destination",
"get_secret",
Expand Down
5 changes: 3 additions & 2 deletions airbyte/caches/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from __future__ import annotations

from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, final

import pandas as pd
Expand All @@ -13,6 +12,7 @@

from airbyte_protocol.models import ConfiguredAirbyteCatalog

from airbyte import constants
from airbyte._writers.base import AirbyteWriterInterface
from airbyte.caches._catalog_backend import CatalogBackendBase, SqlCatalogBackend
from airbyte.caches._state_backend import SqlStateBackend
Expand All @@ -28,6 +28,7 @@

if TYPE_CHECKING:
from collections.abc import Iterator
from pathlib import Path

from airbyte._message_iterators import AirbyteMessageIterator
from airbyte.caches._state_backend_base import StateBackendBase
Expand All @@ -50,7 +51,7 @@ class CacheBase(SqlConfig, AirbyteWriterInterface):
to the SQL backend specified in the `SqlConfig` class.
"""

cache_dir: Path = Field(default=Path(".cache"))
cache_dir: Path = Field(default_factory=lambda: constants.DEFAULT_CACHE_ROOT)
aaronsteers marked this conversation as resolved.
Show resolved Hide resolved
"""The directory to store the cache in."""

cleanup: bool = TEMP_FILE_CLEANUP
Expand Down
97 changes: 97 additions & 0 deletions airbyte/caches/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@
from airbyte.caches.duckdb import DuckDBCache


# Google Drive constants (defaults for the `get_colab_cache` arguments):

# Default `drive_name` for `get_colab_cache`. That function compares `drive_name` against this
# value: on a match the cache lives under `<mount_path>/<drive_name>`, otherwise it lives under
# `<mount_path>/Shareddrives/<drive_name>`.
_MY_DRIVE = "MyDrive"
"""The default name of the user's personal Google Drive."""

# Default `mount_path` for `get_colab_cache`, which passes it to `drive.mount()`.
_GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = "/content/drive"
"""The recommended path to mount Google Drive to."""


# Utility functions:


def get_default_cache() -> DuckDBCache:
"""Get a local cache for storing data, using the default database path.

Expand Down Expand Up @@ -63,3 +75,88 @@ def new_local_cache(
cache_dir=cache_dir,
cleanup=cleanup,
)


def get_colab_cache(
    cache_name: str = "default_cache",
    sub_dir: str = "Airbyte/cache",
    schema_name: str = "main",
    table_prefix: str | None = "",
    drive_name: str = _MY_DRIVE,
    mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH,
) -> DuckDBCache:
    """Get a DuckDB cache stored in Google Drive, for use in Google Colab notebooks.

    Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple
    Colab sessions.

    Please note that Google Colab may prompt you to authenticate with your Google account to access
    your Google Drive. When prompted, click the link and follow the instructions.

    Colab will require access to read and write files in your Google Drive, so please be sure to
    grant the necessary permissions when prompted.

    All arguments are optional and have default values that are suitable for most use cases.

    Args:
        cache_name: The name to use for the cache. Defaults to "default_cache". The database file
            is named `<cache_name>.duckdb`. Override this if you want to use a different database
            for different projects.
        sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this
            if you want to store the cache in a different subdirectory than the default.
        schema_name: The name of the schema to write to. Defaults to "main". Override this if you
            want to write to a different schema.
        table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this
            if you want to use a different prefix for all tables.
        drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you
            want to store data in a shared drive instead of your personal drive. Any other name
            is resolved under the `Shareddrives` folder of the mount path.
        mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this
            if you want to mount Google Drive to a different path (not recommended).

    Returns:
        A `DuckDBCache` whose database file and cache directory are both located at
        `<drive root>/<sub_dir>` inside the mounted Google Drive.

    Raises:
        ImportError: If the `google.colab` package cannot be imported, meaning the code is not
            running inside a Google Colab notebook.

    ## Usage Examples

    The default `get_colab_cache` arguments are suitable for most use cases:

    ```python
    from airbyte.caches.util import get_colab_cache

    colab_cache = get_colab_cache()
    ```

    Or you can call `get_colab_cache` with custom arguments:

    ```python
    custom_cache = get_colab_cache(
        cache_name="my_custom_cache",
        sub_dir="Airbyte/custom_cache",
        drive_name="My Company Drive",
    )
    ```
    """
    # Imported lazily: `google.colab` only exists inside the Colab runtime.
    try:
        from google.colab import drive  # noqa: PLC0415  # type: ignore[reportMissingImports]
    except ImportError:
        msg = (
            "The `google.colab` interface is only available in Google Colab. "
            "Please run this code in a Google Colab notebook."
        )
        # `from None` hides the low-level import traceback behind the friendlier message.
        raise ImportError(msg) from None

    drive.mount(mount_path)

    # The personal drive sits directly under the mount point; every other drive name is
    # looked up under the "Shareddrives" folder.
    drive_root = (
        Path(mount_path) / drive_name
        if drive_name == _MY_DRIVE
        else Path(mount_path) / "Shareddrives" / drive_name
    )

    cache_dir = drive_root / sub_dir
    cache_dir.mkdir(parents=True, exist_ok=True)
    db_file_path = cache_dir / f"{cache_name}.duckdb"

    print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.")
    return DuckDBCache(
        db_path=db_file_path,
        cache_dir=cache_dir,
        schema_name=schema_name,
        table_prefix=table_prefix,
    )
15 changes: 15 additions & 0 deletions airbyte/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import annotations

import os
from pathlib import Path


DEBUG_MODE = False # Set to True to enable additional debug logging.
Expand Down Expand Up @@ -41,6 +42,20 @@
Specific caches may override this value with a different schema name.
"""

DEFAULT_CACHE_ROOT: Path = (
    Path(os.environ["AIRBYTE_CACHE_ROOT"])
    if "AIRBYTE_CACHE_ROOT" in os.environ
    else Path(".cache")
)
"""Root directory for cache files; `.cache` under the current working directory by default.

Set the `AIRBYTE_CACHE_ROOT` environment variable to use a different location. The variable is
read once, when this module is imported.

This is handy when cache files should always land in one specific place. In ephemeral
environments such as Google Colab, for instance, you can point it at a mounted Google Drive
path like `/content/drive/MyDrive/Airbyte/cache`.
"""

DEFAULT_ARROW_MAX_CHUNK_SIZE = 100_000
"""The default number of records to include in each batch of an Arrow dataset."""

Expand Down
Loading