From f918782bf2d3b52c3d87655d20d5dc25c8cb1950 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 14 Sep 2024 13:27:24 -0700 Subject: [PATCH 1/7] feat: add get_colab_cache(), declare the constants module in __all__, add global cache default override --- airbyte/__init__.py | 4 +- airbyte/caches/base.py | 3 +- airbyte/caches/colab.py | 103 ++++++++++++++++++++++++++++++++++++++++ airbyte/constants.py | 15 ++++++ 4 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 airbyte/caches/colab.py diff --git a/airbyte/__init__.py b/airbyte/__init__.py index ab927734..e139bcf0 100644 --- a/airbyte/__init__.py +++ b/airbyte/__init__.py @@ -126,6 +126,7 @@ from airbyte import ( caches, cloud, + constants, datasets, destinations, documents, @@ -154,8 +155,9 @@ __all__ = [ # Modules - "cloud", "caches", + "cloud", + "constants", "datasets", "destinations", "documents", diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index cc168c71..2b020dfe 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -13,6 +13,7 @@ from airbyte_protocol.models import ConfiguredAirbyteCatalog +from airbyte import constants from airbyte._writers.base import AirbyteWriterInterface from airbyte.caches._catalog_backend import CatalogBackendBase, SqlCatalogBackend from airbyte.caches._state_backend import SqlStateBackend @@ -50,7 +51,7 @@ class CacheBase(SqlConfig, AirbyteWriterInterface): to the SQL backend specified in the `SqlConfig` class. """ - cache_dir: Path = Field(default=Path(".cache")) + cache_dir: Path = Field(default_factory=lambda: constants.DEFAULT_CACHE_ROOT) """The directory to store the cache in.""" cleanup: bool = TEMP_FILE_CLEANUP diff --git a/airbyte/caches/colab.py b/airbyte/caches/colab.py new file mode 100644 index 00000000..ec1c9735 --- /dev/null +++ b/airbyte/caches/colab.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. +"""A cache implementation for Google Colab users, with files stored in Google Drive. 
+ +This is a thin wrapper around the default `DuckDBCache` implementation, streamlining +the process of mounting Google Drive and setting up a persistent cache in Google Colab. + +Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple +Colab sessions. + +## Usage Examples + +The default `get_colab_cache` arguments are suitable for most use cases: + +```python +from airbyte.caches.colab import get_colab_cache + +colab_cache = get_colab_cache() +``` + +Or you can call `get_colab_cache` with custom arguments: + +```python +custom_cache = get_colab_cache( + cache_name="my_custom_cache", + sub_dir="Airbyte/custom_cache", + drive_name="My Company Drive", +) +``` +""" + +from __future__ import annotations + +from pathlib import Path + +from airbyte.caches.duckdb import DuckDBCache + + +try: + from google.colab import drive +except ImportError: + # The `GoogleColabCache` class is only available in Google Colab. + drive = None + +MY_DRIVE = "MyDrive" +"""The default name of the user's personal Google Drive.""" + +GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = "/content/drive" +"""The recommended path to mount Google Drive to.""" + + +def get_colab_cache( + cache_name: str = "colab_cache", + sub_dir: str = "Airbyte/cache", + drive_name: str = MY_DRIVE, + mount_path: str = GOOGLE_DRIVE_DEFAULT_MOUNT_PATH, +) -> DuckDBCache: + """Get a local cache for storing data, using the default database path. + + Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple + Colab sessions. + + Please note that Google Colab may prompt you to authenticate with your Google account to access + your Google Drive. When prompted, click the link and follow the instructions. + + Colab will require access to read and write files in your Google Drive, so please be sure to + grant the necessary permissions when prompted. + + All arguments are optional and have default values that are suitable for most use cases. 
+ + Args: + cache_name: The name to use for the cache. Defaults to "colab_cache". Override this if you + want to use a different database for different projects. + sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this + if you want to store the cache in a different subdirectory than the default. + drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you + want to store data in a shared drive instead of your personal drive. + mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this + if you want to mount Google Drive to a different path (not recommended). + """ + if not drive: + msg = ( + "The `GoogleColabCache` class is only available in Google Colab. " + "Please run this code in a Google Colab notebook." + ) + raise ImportError(msg) + + drive.mount(mount_path) + print(f"Successfully mounted Google Drive at `{mount_path}`.") + drive_root = ( + Path(mount_path) / drive_name + if drive_name == MY_DRIVE + else Path(mount_path) / "Shareddrives" / drive_name + ) + + cache_dir = drive_root / sub_dir + cache_dir.mkdir(parents=True, exist_ok=True) + db_file_path = cache_dir / f"{cache_name}.duckdb" + + print(f"Creating persistent PyAirbyte cache in Google Drive: `{db_file_path}`.") + return DuckDBCache( + db_path=db_file_path, + cache_dir=cache_dir, + ) diff --git a/airbyte/constants.py b/airbyte/constants.py index f4f4b969..94548d03 100644 --- a/airbyte/constants.py +++ b/airbyte/constants.py @@ -4,6 +4,7 @@ from __future__ import annotations import os +from pathlib import Path DEBUG_MODE = False # Set to True to enable additional debug logging. @@ -41,6 +42,20 @@ Specific caches may override this value with a different schema name. """ +DEFAULT_CACHE_ROOT: Path = ( + Path() / ".cache" + if "AIRBYTE_CACHE_ROOT" not in os.environ + else Path(os.environ["AIRBYTE_CACHE_ROOT"]) +) +"""Default cache root is `.cache` in the current working directory. 
+ +The default location can be overridden by setting the `AIRBYTE_CACHE_ROOT` environment variable. + +Overriding this can be useful if you always want to store cache files in a specific location. +For example, in ephemeral environments like Google Colab, you might want to store cache files in +your mounted Google Drive by setting this to a path like `/content/drive/MyDrive/Airbyte/cache`. +""" + DEFAULT_ARROW_MAX_CHUNK_SIZE = 100_000 """The default number of records to include in each batch of an Arrow dataset.""" From baa8d230970618d50b280d9813afab9d27599352 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 14 Sep 2024 13:53:00 -0700 Subject: [PATCH 2/7] move colab method to util --- airbyte/__init__.py | 3 +- airbyte/caches/colab.py | 103 ---------------------------------------- airbyte/caches/util.py | 98 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 104 deletions(-) delete mode 100644 airbyte/caches/colab.py diff --git a/airbyte/__init__.py b/airbyte/__init__.py index e139bcf0..93037fea 100644 --- a/airbyte/__init__.py +++ b/airbyte/__init__.py @@ -140,7 +140,7 @@ ) from airbyte.caches.bigquery import BigQueryCache from airbyte.caches.duckdb import DuckDBCache -from airbyte.caches.util import get_default_cache, new_local_cache +from airbyte.caches.util import get_colab_cache, get_default_cache, new_local_cache from airbyte.datasets import CachedDataset from airbyte.destinations.base import Destination from airbyte.destinations.util import get_destination @@ -171,6 +171,7 @@ "sources", # Factories "get_available_connectors", + "get_colab_cache", "get_default_cache", "get_destination", "get_secret", diff --git a/airbyte/caches/colab.py b/airbyte/caches/colab.py deleted file mode 100644 index ec1c9735..00000000 --- a/airbyte/caches/colab.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2024 Airbyte, Inc., all rights reserved. -"""A cache implementation for Google Colab users, with files stored in Google Drive. 
- -This is a thin wrapper around the default `DuckDBCache` implementation, streamlining -the process of mounting Google Drive and setting up a persistent cache in Google Colab. - -Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple -Colab sessions. - -## Usage Examples - -The default `get_colab_cache` arguments are suitable for most use cases: - -```python -from airbyte.caches.colab import get_colab_cache - -colab_cache = get_colab_cache() -``` - -Or you can call `get_colab_cache` with custom arguments: - -```python -custom_cache = get_colab_cache( - cache_name="my_custom_cache", - sub_dir="Airbyte/custom_cache", - drive_name="My Company Drive", -) -``` -""" - -from __future__ import annotations - -from pathlib import Path - -from airbyte.caches.duckdb import DuckDBCache - - -try: - from google.colab import drive -except ImportError: - # The `GoogleColabCache` class is only available in Google Colab. - drive = None - -MY_DRIVE = "MyDrive" -"""The default name of the user's personal Google Drive.""" - -GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = "/content/drive" -"""The recommended path to mount Google Drive to.""" - - -def get_colab_cache( - cache_name: str = "colab_cache", - sub_dir: str = "Airbyte/cache", - drive_name: str = MY_DRIVE, - mount_path: str = GOOGLE_DRIVE_DEFAULT_MOUNT_PATH, -) -> DuckDBCache: - """Get a local cache for storing data, using the default database path. - - Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple - Colab sessions. - - Please note that Google Colab may prompt you to authenticate with your Google account to access - your Google Drive. When prompted, click the link and follow the instructions. - - Colab will require access to read and write files in your Google Drive, so please be sure to - grant the necessary permissions when prompted. - - All arguments are optional and have default values that are suitable for most use cases. 
- - Args: - cache_name: The name to use for the cache. Defaults to "colab_cache". Override this if you - want to use a different database for different projects. - sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this - if you want to store the cache in a different subdirectory than the default. - drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you - want to store data in a shared drive instead of your personal drive. - mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this - if you want to mount Google Drive to a different path (not recommended). - """ - if not drive: - msg = ( - "The `GoogleColabCache` class is only available in Google Colab. " - "Please run this code in a Google Colab notebook." - ) - raise ImportError(msg) - - drive.mount(mount_path) - print(f"Successfully mounted Google Drive at `{mount_path}`.") - drive_root = ( - Path(mount_path) / drive_name - if drive_name == MY_DRIVE - else Path(mount_path) / "Shareddrives" / drive_name - ) - - cache_dir = drive_root / sub_dir - cache_dir.mkdir(parents=True, exist_ok=True) - db_file_path = cache_dir / f"{cache_name}.duckdb" - - print(f"Creating persistent PyAirbyte cache in Google Drive: `{db_file_path}`.") - return DuckDBCache( - db_path=db_file_path, - cache_dir=cache_dir, - ) diff --git a/airbyte/caches/util.py b/airbyte/caches/util.py index d1cf2128..73ec509c 100644 --- a/airbyte/caches/util.py +++ b/airbyte/caches/util.py @@ -11,6 +11,18 @@ from airbyte.caches.duckdb import DuckDBCache +# Google drive constants: + +_MY_DRIVE = "MyDrive" +"""The default name of the user's personal Google Drive.""" + +_GOOGLE_DRIVE_DEFAULT_MOUNT_PATH = "/content/drive" +"""The recommended path to mount Google Drive to.""" + + +# Utility functions: + + def get_default_cache() -> DuckDBCache: """Get a local cache for storing data, using the default database path. 
@@ -63,3 +75,89 @@ def new_local_cache( cache_dir=cache_dir, cleanup=cleanup, ) + + +def get_colab_cache( + cache_name: str = "default_cache", + sub_dir: str = "Airbyte/cache", + schema_name: str = "main", + table_prefix: str | None = "", + drive_name: str = _MY_DRIVE, + mount_path: str = _GOOGLE_DRIVE_DEFAULT_MOUNT_PATH, +) -> DuckDBCache: + """Get a persistent DuckDB cache stored in Google Drive, for use in Google Colab. + + Unlike the default `DuckDBCache`, this implementation will easily persist data across multiple + Colab sessions. + + Please note that Google Colab may prompt you to authenticate with your Google account to access + your Google Drive. When prompted, click the link and follow the instructions. + + Colab will require access to read and write files in your Google Drive, so please be sure to + grant the necessary permissions when prompted. + + All arguments are optional and have default values that are suitable for most use cases. + + Args: + cache_name: The name to use for the cache. Defaults to "default_cache". Override this if you + want to use a different database for different projects. + sub_dir: The subdirectory to store the cache in. Defaults to "Airbyte/cache". Override this + if you want to store the cache in a different subdirectory than the default. + schema_name: The name of the schema to write to. Defaults to "main". Override this if you + want to write to a different schema. + table_prefix: The prefix to use for all tables in the cache. Defaults to "". Override this + if you want to use a different prefix for all tables. + drive_name: The name of the Google Drive to use. Defaults to "MyDrive". Override this if you + want to store data in a shared drive instead of your personal drive. + mount_path: The path to mount Google Drive to. Defaults to "/content/drive". Override this + if you want to mount Google Drive to a different path (not recommended). 
+ ## Usage Examples + + The default `get_colab_cache` arguments are suitable for most use cases: + + ```python + from airbyte.caches.util import get_colab_cache + + colab_cache = get_colab_cache() + ``` + + Or you can call `get_colab_cache` with custom arguments: + + ```python + custom_cache = get_colab_cache( + cache_name="my_custom_cache", + sub_dir="Airbyte/custom_cache", + drive_name="My Company Drive", + ) + ``` + """ + try: + from google.colab import drive # noqa: PLC0415 # type: ignore[reportMissingImports] + except ImportError: + drive = None + msg = ( + "The `google.colab` interface is only available in Google Colab. " + "Please run this code in a Google Colab notebook." + ) + raise ImportError(msg) from None + + drive.mount(mount_path) + print(f"Successfully mounted Google Drive at `{mount_path}`.") + drive_root = ( + Path(mount_path) / drive_name + if drive_name == _MY_DRIVE + else Path(mount_path) / "Shareddrives" / drive_name + ) + + cache_dir = drive_root / sub_dir + cache_dir.mkdir(parents=True, exist_ok=True) + db_file_path = cache_dir / f"{cache_name}.duckdb" + + print(f"Creating persistent PyAirbyte cache in Google Drive: `{db_file_path}`.") + return DuckDBCache( + db_path=db_file_path, + cache_dir=cache_dir, + schema_name=schema_name, + table_prefix=table_prefix, + ) From 5b3a8eb6378e4409fbcae39bf6adff262c458a98 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 14 Sep 2024 14:02:04 -0700 Subject: [PATCH 3/7] remove redundant print --- airbyte/caches/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/airbyte/caches/util.py b/airbyte/caches/util.py index 73ec509c..392cf6c4 100644 --- a/airbyte/caches/util.py +++ b/airbyte/caches/util.py @@ -143,7 +143,6 @@ def get_colab_cache( raise ImportError(msg) from None drive.mount(mount_path) - print(f"Successfully mounted Google Drive at `{mount_path}`.") drive_root = ( Path(mount_path) / drive_name if drive_name == _MY_DRIVE From 0a7ebe259baba41a2682c7914fd6412410118870 Mon Sep 17 00:00:00
2001 From: Aaron Steers Date: Sat, 14 Sep 2024 14:35:01 -0700 Subject: [PATCH 4/7] improved wording of print --- airbyte/caches/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte/caches/util.py b/airbyte/caches/util.py index 392cf6c4..1d192406 100644 --- a/airbyte/caches/util.py +++ b/airbyte/caches/util.py @@ -153,7 +153,7 @@ def get_colab_cache( cache_dir.mkdir(parents=True, exist_ok=True) db_file_path = cache_dir / f"{cache_name}.duckdb" - print(f"Creating persistent PyAirbyte cache in Google Drive: `{db_file_path}`.") + print(f"Using persistent PyAirbyte cache in Google Drive: `{db_file_path}`.") return DuckDBCache( db_path=db_file_path, cache_dir=cache_dir, From 6a4c48735ce057d5bded05dd73f7cbc36268d8ee Mon Sep 17 00:00:00 2001 From: octavia-squidington-iii Date: Sat, 14 Sep 2024 21:38:20 +0000 Subject: [PATCH 5/7] Auto-fix lint issues (unsafe) --- airbyte/caches/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index 2b020dfe..bed4fe05 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -3,7 +3,6 @@ from __future__ import annotations -from pathlib import Path from typing import IO, TYPE_CHECKING, Any, final import pandas as pd @@ -29,6 +28,7 @@ if TYPE_CHECKING: from collections.abc import Iterator + from pathlib import Path from airbyte._message_iterators import AirbyteMessageIterator from airbyte.caches._state_backend_base import StateBackendBase From 0d9e503e0c08187d476bd19182a3d3f8d79a8578 Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Sat, 14 Sep 2024 14:41:48 -0700 Subject: [PATCH 6/7] Update airbyte/caches/base.py --- airbyte/caches/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index bed4fe05..714b7f2e 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -51,7 +51,7 @@ class CacheBase(SqlConfig, AirbyteWriterInterface): to the SQL 
backend specified in the `SqlConfig` class. """ - cache_dir: Path = Field(default_factory=lambda: constants.DEFAULT_CACHE_ROOT) + cache_dir: Path = Field(default=constants.DEFAULT_CACHE_ROOT) """The directory to store the cache in.""" cleanup: bool = TEMP_FILE_CLEANUP From 8457eb58a2ca4092f0288f37f46907267f131862 Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Sat, 14 Sep 2024 20:01:46 -0700 Subject: [PATCH 7/7] Update base.py --- airbyte/caches/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte/caches/base.py b/airbyte/caches/base.py index 714b7f2e..ce917358 100644 --- a/airbyte/caches/base.py +++ b/airbyte/caches/base.py @@ -3,6 +3,7 @@ from __future__ import annotations +from pathlib import Path from typing import IO, TYPE_CHECKING, Any, final import pandas as pd @@ -28,7 +29,6 @@ if TYPE_CHECKING: from collections.abc import Iterator - from pathlib import Path from airbyte._message_iterators import AirbyteMessageIterator from airbyte.caches._state_backend_base import StateBackendBase @@ -51,7 +51,7 @@ class CacheBase(SqlConfig, AirbyteWriterInterface): to the SQL backend specified in the `SqlConfig` class. """ - cache_dir: Path = Field(default=constants.DEFAULT_CACHE_ROOT) + cache_dir: Path = Field(default=Path(constants.DEFAULT_CACHE_ROOT)) """The directory to store the cache in.""" cleanup: bool = TEMP_FILE_CLEANUP