diff --git a/MANIFEST.in b/MANIFEST.in index 356b719f3b..3c2322f084 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -17,7 +17,7 @@ include requirements/huggingface.in # Ingest extras include requirements/ingest/airtable.in -include requirements/ingest/astra.in +include requirements/ingest/astradb.in include requirements/ingest/azure-cognitive-search.in include requirements/ingest/azure.in include requirements/ingest/biomed.in diff --git a/Makefile b/Makefile index c1d02a0c66..58583fd563 100644 --- a/Makefile +++ b/Makefile @@ -253,9 +253,9 @@ install-ingest-mongodb: install-ingest-databricks-volumes: python3 -m pip install -r requirements/ingest/databricks-volumes.txt -.PHONY: install-ingest-astra -install-ingest-astra: - python3 -m pip install -r requirements/ingest/astra.txt +.PHONY: install-ingest-astradb +install-ingest-astradb: + python3 -m pip install -r requirements/ingest/astradb.txt .PHONY: install-ingest-clarifai install-ingest-clarifai: diff --git a/requirements/ingest/astra.in b/requirements/ingest/astradb.in similarity index 100% rename from requirements/ingest/astra.in rename to requirements/ingest/astradb.in diff --git a/requirements/ingest/astra.txt b/requirements/ingest/astradb.txt similarity index 96% rename from requirements/ingest/astra.txt rename to requirements/ingest/astradb.txt index 746f091266..24d10b1432 100644 --- a/requirements/ingest/astra.txt +++ b/requirements/ingest/astradb.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile ./ingest/astra.in +# pip-compile ./ingest/astradb.in # anyio==3.7.1 # via @@ -10,7 +10,7 @@ anyio==3.7.1 # -c ./ingest/../deps/constraints.txt # httpx astrapy==1.4.0 - # via -r ./ingest/astra.in + # via -r ./ingest/astradb.in bson==0.5.10 # via astrapy cassandra-driver==3.29.1 diff --git a/setup.py b/setup.py index 717f477be6..b5741cdf72 100644 --- a/setup.py +++ b/setup.py @@ -129,7 +129,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List "xlsx": xlsx_reqs, # Extra requirements for data connectors "airtable": load_requirements("requirements/ingest/airtable.in"), - "astra": load_requirements("requirements/ingest/astra.in"), + "astradb": load_requirements("requirements/ingest/astradb.in"), "azure": load_requirements("requirements/ingest/azure.in"), "azure-cognitive-search": load_requirements( "requirements/ingest/azure-cognitive-search.in", diff --git a/test_unstructured_ingest/dest/astra.sh b/test_unstructured_ingest/dest/astradb.sh similarity index 88% rename from test_unstructured_ingest/dest/astra.sh rename to test_unstructured_ingest/dest/astradb.sh index dee4fe1bfa..77fc0e25ef 100755 --- a/test_unstructured_ingest/dest/astra.sh +++ b/test_unstructured_ingest/dest/astradb.sh @@ -5,7 +5,7 @@ set -e SRC_PATH=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$SRC_PATH") cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=astra-dest +OUTPUT_FOLDER_NAME=astradb-dest OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} @@ -21,7 +21,7 @@ if [ -z "$ASTRA_DB_API_ENDPOINT" ]; then fi RANDOM_SUFFIX=$((RANDOM % 100000 + 1)) -COLLECTION_NAME="astra_test_output_$RANDOM_SUFFIX" +COLLECTION_NAME="astradb_test_output_$RANDOM_SUFFIX" EMBEDDING_DIMENSION=384 # shellcheck disable=SC1091 @@ -31,7 +31,7 @@ function cleanup() { cleanup_dir "$OUTPUT_DIR" cleanup_dir "$WORK_DIR" - python "$SCRIPT_DIR"/python/test-ingest-astra-output.py \ + python "$SCRIPT_DIR"/python/test-ingest-astradb-output.py \ --token "$ASTRA_DB_APPLICATION_TOKEN" \ --api-endpoint "$ASTRA_DB_API_ENDPOINT" \ --collection-name "$COLLECTION_NAME" down @@ -51,14 +51,14 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --chunk-max-characters 1500 \ --chunk-multipage-sections \ --embedding-provider "langchain-huggingface" \ - astra \ + astradb \ --token "$ASTRA_DB_APPLICATION_TOKEN" \ --api-endpoint "$ASTRA_DB_API_ENDPOINT" \ --collection-name "$COLLECTION_NAME" \ --embedding-dimension "$EMBEDDING_DIMENSION" \ --requested-indexing-policy '{"deny": ["metadata"]}' -python "$SCRIPT_DIR"/python/test-ingest-astra-output.py \ +python "$SCRIPT_DIR"/python/test-ingest-astradb-output.py \ --token "$ASTRA_DB_APPLICATION_TOKEN" \ --api-endpoint "$ASTRA_DB_API_ENDPOINT" \ --collection-name "$COLLECTION_NAME" check diff --git a/test_unstructured_ingest/python/test-ingest-astra-output.py b/test_unstructured_ingest/python/test-ingest-astradb-output.py similarity index 98% rename from test_unstructured_ingest/python/test-ingest-astra-output.py rename to test_unstructured_ingest/python/test-ingest-astradb-output.py index 661ec92a80..137745e6ab 100755 --- a/test_unstructured_ingest/python/test-ingest-astra-output.py +++ b/test_unstructured_ingest/python/test-ingest-astradb-output.py @@ -10,7 +10,7 @@ def get_client(token, api_endpoint, collection_name) -> AstraDB: return astra_db, astra_db_collection -@click.group(name="astra-ingest") +@click.group(name="astradb-ingest") @click.option("--token", type=str) @click.option("--api-endpoint", type=str) @click.option("--collection-name", type=str, default="collection_test") diff --git a/test_unstructured_ingest/src/astra.sh b/test_unstructured_ingest/src/astradb.sh similarity index 97% rename from test_unstructured_ingest/src/astra.sh rename to test_unstructured_ingest/src/astradb.sh index a1dbc346ff..9aa89c48fa 100755 --- a/test_unstructured_ingest/src/astra.sh +++ b/test_unstructured_ingest/src/astradb.sh @@ -5,7 +5,7 @@ set -e SRC_PATH=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$SRC_PATH") cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=astra +OUTPUT_FOLDER_NAME=astradb OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME @@ -23,7 +23,7 @@ fi COLLECTION_NAME="ingest_test_src" PYTHONPATH=. ./unstructured/ingest/main.py \ - astra \ + astradb \ --token "$ASTRA_DB_APPLICATION_TOKEN" \ --api-endpoint "$ASTRA_DB_API_ENDPOINT" \ --collection-name "$COLLECTION_NAME" \ diff --git a/test_unstructured_ingest/test-ingest-dest.sh b/test_unstructured_ingest/test-ingest-dest.sh index ab6634cc0e..ab1e819735 100755 --- a/test_unstructured_ingest/test-ingest-dest.sh +++ b/test_unstructured_ingest/test-ingest-dest.sh @@ -15,7 +15,7 @@ cd "$SCRIPT_DIR"/.. || exit 1 export OMP_THREAD_LIMIT=1 all_tests=( - 'astra.sh' + 'astradb.sh' 'azure.sh' 'azure-cognitive-search.sh' 'box.sh' diff --git a/test_unstructured_ingest/test-ingest-src.sh b/test_unstructured_ingest/test-ingest-src.sh index 250b0e9ad6..9b4a7fc938 100755 --- a/test_unstructured_ingest/test-ingest-src.sh +++ b/test_unstructured_ingest/test-ingest-src.sh @@ -19,7 +19,7 @@ export OMP_THREAD_LIMIT=1 all_tests=( 's3.sh' 's3-minio.sh' - 'astra.sh' + 'astradb.sh' 'azure.sh' 'biomed-api.sh' 'biomed-path.sh' diff --git a/unstructured/ingest/cli/cmds/__init__.py b/unstructured/ingest/cli/cmds/__init__.py index 9384e20067..f75ee797e6 100644 --- a/unstructured/ingest/cli/cmds/__init__.py +++ b/unstructured/ingest/cli/cmds/__init__.py @@ -7,8 +7,8 @@ from unstructured.ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd from .airtable import get_base_src_cmd as airtable_base_src_cmd -from .astra import get_base_dest_cmd as astra_base_dest_cmd -from .astra import get_base_src_cmd as astra_base_src_cmd +from .astradb import get_base_dest_cmd as astradb_base_dest_cmd +from .astradb import get_base_src_cmd as astradb_base_src_cmd from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd from .biomed import get_base_src_cmd as biomed_base_src_cmd from .chroma import get_base_dest_cmd as chroma_base_dest_cmd @@ -63,7 +63,7 @@ base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [ airtable_base_src_cmd, - astra_base_src_cmd, + astradb_base_src_cmd, azure_base_src_cmd, biomed_base_src_cmd, box_base_src_cmd, @@ -106,7 +106,7 @@ ) base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [ - astra_base_dest_cmd, + astradb_base_dest_cmd, azure_base_dest_cmd, box_base_dest_cmd, chroma_base_dest_cmd, diff --git a/unstructured/ingest/cli/cmds/astra.py b/unstructured/ingest/cli/cmds/astradb.py similarity index 85% rename from unstructured/ingest/cli/cmds/astra.py rename to unstructured/ingest/cli/cmds/astradb.py index 5729b1f643..b7be8f56cc 100644 --- a/unstructured/ingest/cli/cmds/astra.py +++ b/unstructured/ingest/cli/cmds/astradb.py @@ -4,11 +4,11 @@ import click from unstructured.ingest.cli.interfaces import CliConfig, Dict -from unstructured.ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig +from unstructured.ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig @dataclass -class AstraCliConfig(SimpleAstraConfig, CliConfig): +class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig): @staticmethod def get_cli_options() -> t.List[click.Option]: options = [ @@ -48,7 +48,7 @@ def get_cli_options() -> t.List[click.Option]: @dataclass -class AstraCliWriteConfig(AstraWriteConfig, CliConfig): +class AstraDBCliWriteConfig(AstraDBWriteConfig, CliConfig): @staticmethod def get_cli_options() -> t.List[click.Option]: options = [ @@ -81,8 +81,8 @@ def get_base_src_cmd(): from unstructured.ingest.cli.base.src import BaseSrcCmd cmd_cls = BaseSrcCmd( - cmd_name="astra", - cli_config=AstraCliConfig, + cmd_name="astradb", + cli_config=AstraDBCliConfig, ) return cmd_cls @@ -91,9 +91,9 @@ def get_base_dest_cmd(): from unstructured.ingest.cli.base.dest import BaseDestCmd cmd_cls = BaseDestCmd( - cmd_name="astra", - cli_config=AstraCliConfig, - additional_cli_options=[AstraCliWriteConfig], - write_config=AstraWriteConfig, + cmd_name="astradb", + cli_config=AstraDBCliConfig, + additional_cli_options=[AstraDBCliWriteConfig], + write_config=AstraDBWriteConfig, ) return cmd_cls diff --git a/unstructured/ingest/connector/astra.py b/unstructured/ingest/connector/astradb.py similarity index 85% rename from unstructured/ingest/connector/astra.py rename to unstructured/ingest/connector/astradb.py index baf0cd836f..2642ea1913 100644 --- a/unstructured/ingest/connector/astra.py +++ b/unstructured/ingest/connector/astradb.py @@ -31,23 +31,23 @@ @dataclass -class AstraAccessConfig(AccessConfig): +class AstraDBAccessConfig(AccessConfig): token: str = enhanced_field(sensitive=True) api_endpoint: str = enhanced_field(sensitive=True) @dataclass -class SimpleAstraConfig(BaseConnectorConfig): - access_config: AstraAccessConfig +class SimpleAstraDBConfig(BaseConnectorConfig): + access_config: AstraDBAccessConfig collection_name: str namespace: t.Optional[str] = None @dataclass -class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleAstraConfig +class AstraDBIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): + connector_config: SimpleAstraDBConfig metadata: t.Dict[str, str] = field(default_factory=dict) - registry_name: str = "astra" + registry_name: str = "astradb" @property def filename(self): @@ -76,7 +76,7 @@ def update_source_metadata(self, **kwargs): ) @SourceConnectionError.wrap - @requires_dependencies(["astrapy"], extras="astra") + @requires_dependencies(["astrapy"], extras="astradb") @BaseSingleIngestDoc.skip_if_file_exists def get_file(self): self.filename.parent.mkdir(parents=True, exist_ok=True) @@ -90,19 +90,19 @@ def get_file(self): @dataclass -class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleAstraConfig +class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): + connector_config: SimpleAstraDBConfig _astra_db: t.Optional["AstraDB"] = field(init=False, default=None) _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None) @property - @requires_dependencies(["astrapy"], extras="astra") + @requires_dependencies(["astrapy"], extras="astradb") def astra_db_collection(self) -> "AstraDBCollection": if self._astra_db_collection is None: from astrapy.db import AstraDB # Build the Astra DB object. - # caller_name/version for AstraDB tracking + # caller_name/version for Astra DB tracking self._astra_db = AstraDB( api_endpoint=self.connector_config.access_config.api_endpoint, token=self.connector_config.access_config.token, @@ -117,12 +117,12 @@ def astra_db_collection(self) -> "AstraDBCollection": ) return self._astra_db_collection # type: ignore - @requires_dependencies(["astrapy"], extras="astra") + @requires_dependencies(["astrapy"], extras="astradb") @SourceConnectionError.wrap # type: ignore def initialize(self): _ = self.astra_db_collection - @requires_dependencies(["astrapy"], extras="astra") + @requires_dependencies(["astrapy"], extras="astradb") def check_connection(self): try: _ = self.astra_db_collection @@ -130,14 +130,14 @@ def check_connection(self): logger.error(f"Failed to validate connection {e}", exc_info=True) raise SourceConnectionError(f"failed to validate connection: {e}") - @requires_dependencies(["astrapy"], extras="astra") + @requires_dependencies(["astrapy"], extras="astradb") def get_ingest_docs(self): # type: ignore # Perform the find operation astra_docs = list(self.astra_db_collection.paginated_find()) doc_list = [] for record in astra_docs: - doc = AstraIngestDoc( + doc = AstraDBIngestDoc( connector_config=self.connector_config, processor_config=self.processor_config, read_config=self.read_config, @@ -152,16 +152,16 @@ def get_ingest_docs(self): # type: ignore @dataclass -class AstraWriteConfig(WriteConfig): +class AstraDBWriteConfig(WriteConfig): embedding_dimension: int requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None batch_size: int = 20 @dataclass -class AstraDestinationConnector(BaseDestinationConnector): - write_config: AstraWriteConfig - connector_config: SimpleAstraConfig +class AstraDBDestinationConnector(BaseDestinationConnector): + write_config: AstraDBWriteConfig + connector_config: SimpleAstraDBConfig _astra_db: t.Optional["AstraDB"] = field(init=False, default=None) _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None) @@ -180,7 +180,7 @@ def to_dict(self, **kwargs): return _asdict(self_cp, **kwargs) @property - @requires_dependencies(["astrapy"], extras="astra") + @requires_dependencies(["astrapy"], extras="astradb") def astra_db_collection(self) -> "AstraDBCollection": if self._astra_db_collection is None: from astrapy.db import AstraDB @@ -188,11 +188,11 @@ def astra_db_collection(self) -> "AstraDBCollection": collection_name = self.connector_config.collection_name embedding_dimension = self.write_config.embedding_dimension - # If the user has requested an indexing policy, pass it to the AstraDB + # If the user has requested an indexing policy, pass it to the Astra DB requested_indexing_policy = self.write_config.requested_indexing_policy options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None - # caller_name/version for AstraDB tracking + # caller_name/version for Astra DB tracking self._astra_db = AstraDB( api_endpoint=self.connector_config.access_config.api_endpoint, token=self.connector_config.access_config.token, @@ -209,12 +209,12 @@ def astra_db_collection(self) -> "AstraDBCollection": ) return self._astra_db_collection - @requires_dependencies(["astrapy"], extras="astra") + @requires_dependencies(["astrapy"], extras="astradb") @DestinationConnectionError.wrap def initialize(self): _ = self.astra_db_collection - @requires_dependencies(["astrapy"], extras="astra") + @requires_dependencies(["astrapy"], extras="astradb") def check_connection(self): try: _ = self.astra_db_collection @@ -223,7 +223,7 @@ def check_connection(self): raise DestinationConnectionError(f"failed to validate connection: {e}") def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra.") + logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra DB.") astra_batch_size = self.write_config.batch_size diff --git a/unstructured/ingest/connector/registry.py b/unstructured/ingest/connector/registry.py index 21f4e53765..35250d6f05 100644 --- a/unstructured/ingest/connector/registry.py +++ b/unstructured/ingest/connector/registry.py @@ -2,7 +2,7 @@ from typing import Dict, Type, cast from unstructured.ingest.connector.airtable import AirtableIngestDoc -from unstructured.ingest.connector.astra import AstraIngestDoc +from unstructured.ingest.connector.astradb import AstraDBIngestDoc from unstructured.ingest.connector.biomed import BiomedIngestDoc from unstructured.ingest.connector.confluence import ConfluenceIngestDoc from unstructured.ingest.connector.delta_table import DeltaTableIngestDoc @@ -46,7 +46,7 @@ INGEST_DOC_NAME_TO_CLASS: Dict[str, Type[EnhancedDataClassJsonMixin]] = { "airtable": AirtableIngestDoc, - "astra": AstraIngestDoc, + "astradb": AstraDBIngestDoc, "azure": AzureBlobStorageIngestDoc, "biomed": BiomedIngestDoc, "box": BoxIngestDoc, diff --git a/unstructured/ingest/runner/__init__.py b/unstructured/ingest/runner/__init__.py index f4ef3629eb..872ebb10d5 100644 --- a/unstructured/ingest/runner/__init__.py +++ b/unstructured/ingest/runner/__init__.py @@ -2,7 +2,7 @@ from typing import Type from .airtable import AirtableRunner -from .astra import AstraRunner +from .astradb import AstraDBRunner from .base_runner import Runner from .biomed import BiomedRunner from .confluence import ConfluenceRunner @@ -36,7 +36,7 @@ runner_map: t.Dict[str, Type[Runner]] = { "airtable": AirtableRunner, - "astra": AstraRunner, + "astradb": AstraDBRunner, "azure": AzureRunner, "biomed": BiomedRunner, "box": BoxRunner, diff --git a/unstructured/ingest/runner/astra.py b/unstructured/ingest/runner/astradb.py similarity index 71% rename from unstructured/ingest/runner/astra.py rename to unstructured/ingest/runner/astradb.py index e5afc9cb6a..a07c66b93e 100644 --- a/unstructured/ingest/runner/astra.py +++ b/unstructured/ingest/runner/astradb.py @@ -8,27 +8,27 @@ from unstructured.ingest.runner.utils import update_download_dir_hash if t.TYPE_CHECKING: - from unstructured.ingest.connector.astra import SimpleAstraConfig + from unstructured.ingest.connector.astradb import SimpleAstraDBConfig @dataclass -class AstraRunner(Runner): - connector_config: "SimpleAstraConfig" +class AstraDBRunner(Runner): + connector_config: "SimpleAstraDBConfig" def update_read_config(self): hashed_dir_name = hashlib.sha256( str(self.connector_config.access_config.api_endpoint).encode("utf-8"), ) self.read_config.download_dir = update_download_dir_hash( - connector_name="astra", + connector_name="astradb", read_config=self.read_config, hashed_dir_name=hashed_dir_name, logger=logger, ) def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.astra import ( - AstraSourceConnector, + from unstructured.ingest.connector.astradb import ( + AstraDBSourceConnector, ) - return AstraSourceConnector + return AstraDBSourceConnector diff --git a/unstructured/ingest/runner/writers/__init__.py b/unstructured/ingest/runner/writers/__init__.py index 130015cf47..8b07adb9ed 100644 --- a/unstructured/ingest/runner/writers/__init__.py +++ b/unstructured/ingest/runner/writers/__init__.py @@ -1,6 +1,6 @@ import typing as t -from .astra import AstraWriter +from .astradb import AstraDBWriter from .azure_cognitive_search import AzureCognitiveSearchWriter from .base_writer import Writer from .chroma import ChromaWriter @@ -23,7 +23,7 @@ from .weaviate import WeaviateWriter writer_map: t.Dict[str, t.Type[Writer]] = { - "astra": AstraWriter, + "astradb": AstraDBWriter, "azure": AzureWriter, "azure_cognitive_search": AzureCognitiveSearchWriter, "box": BoxWriter, diff --git a/unstructured/ingest/runner/writers/astra.py b/unstructured/ingest/runner/writers/astradb.py similarity index 50% rename from unstructured/ingest/runner/writers/astra.py rename to unstructured/ingest/runner/writers/astradb.py index eb92d674f8..b12ee7234e 100644 --- a/unstructured/ingest/runner/writers/astra.py +++ b/unstructured/ingest/runner/writers/astradb.py @@ -6,17 +6,17 @@ from unstructured.ingest.runner.writers.base_writer import Writer if t.TYPE_CHECKING: - from unstructured.ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig + from unstructured.ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig @dataclass -class AstraWriter(Writer, EnhancedDataClassJsonMixin): - write_config: "AstraWriteConfig" - connector_config: "SimpleAstraConfig" +class AstraDBWriter(Writer, EnhancedDataClassJsonMixin): + write_config: "AstraDBWriteConfig" + connector_config: "SimpleAstraDBConfig" def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.astra import ( - AstraDestinationConnector, + from unstructured.ingest.connector.astradb import ( + AstraDBDestinationConnector, ) - return AstraDestinationConnector + return AstraDBDestinationConnector diff --git a/unstructured/ingest/v2/cli/cmds/__init__.py b/unstructured/ingest/v2/cli/cmds/__init__.py index 81563f9722..4a4a74c5d8 100644 --- a/unstructured/ingest/v2/cli/cmds/__init__.py +++ b/unstructured/ingest/v2/cli/cmds/__init__.py @@ -2,7 +2,7 @@ import click -from .astra import astra_dest_cmd +from .astradb import astradb_dest_cmd from .azure_cognitive_search import azure_cognitive_search_dest_cmd from .chroma import chroma_dest_cmd from .databricks_volumes import databricks_volumes_dest_cmd @@ -51,7 +51,7 @@ ) dest_cmds = [ - astra_dest_cmd, + astradb_dest_cmd, azure_cognitive_search_dest_cmd, azure_dest_cmd, box_dest_cmd, diff --git a/unstructured/ingest/v2/cli/cmds/astra.py b/unstructured/ingest/v2/cli/cmds/astradb.py similarity index 85% rename from unstructured/ingest/v2/cli/cmds/astra.py rename to unstructured/ingest/v2/cli/cmds/astradb.py index 5e24fcf772..36de30f703 100644 --- a/unstructured/ingest/v2/cli/cmds/astra.py +++ b/unstructured/ingest/v2/cli/cmds/astradb.py @@ -5,11 +5,11 @@ from unstructured.ingest.v2.cli.base import DestCmd from unstructured.ingest.v2.cli.interfaces import CliConfig from unstructured.ingest.v2.cli.utils import Dict -from unstructured.ingest.v2.processes.connectors.astra import CONNECTOR_TYPE +from unstructured.ingest.v2.processes.connectors.astradb import CONNECTOR_TYPE @dataclass -class AstraCliConnectionConfig(CliConfig): +class AstraDBCliConnectionConfig(CliConfig): @staticmethod def get_cli_options() -> list[click.Option]: options = [ @@ -18,7 +18,7 @@ def get_cli_options() -> list[click.Option]: required=True, type=str, help="Astra DB Token with access to the database.", - envvar="ASTRA_DB_TOKEN", + envvar="ASTRA_DB_APPLICATION_TOKEN", show_envvar=True, ), click.Option( @@ -26,7 +26,7 @@ def get_cli_options() -> list[click.Option]: required=True, type=str, help="The API endpoint for the Astra DB.", - envvar="ASTRA_DB_ENDPOINT", + envvar="ASTRA_DB_API_ENDPOINT", show_envvar=True, ), ] @@ -34,7 +34,7 @@ def get_cli_options() -> list[click.Option]: @dataclass -class AstraCliUploaderConfig(CliConfig): +class AstraDBCliUploaderConfig(CliConfig): @staticmethod def get_cli_options() -> list[click.Option]: options = [ @@ -78,8 +78,8 @@ def get_cli_options() -> list[click.Option]: return options -astra_dest_cmd = DestCmd( +astradb_dest_cmd = DestCmd( cmd_name=CONNECTOR_TYPE, - connection_config=AstraCliConnectionConfig, - uploader_config=AstraCliUploaderConfig, + connection_config=AstraDBCliConnectionConfig, + uploader_config=AstraDBCliUploaderConfig, ) diff --git a/unstructured/ingest/v2/processes/connectors/__init__.py b/unstructured/ingest/v2/processes/connectors/__init__.py index 11a3d3be8d..5e4e2cf13d 100644 --- a/unstructured/ingest/v2/processes/connectors/__init__.py +++ b/unstructured/ingest/v2/processes/connectors/__init__.py @@ -6,8 +6,8 @@ add_source_entry, ) -from .astra import CONNECTOR_TYPE as ASTRA_CONNECTOR_TYPE -from .astra import astra_destination_entry +from .astradb import CONNECTOR_TYPE as ASTRADB_CONNECTOR_TYPE +from .astradb import astradb_destination_entry from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE from .chroma import chroma_destination_entry from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE @@ -37,7 +37,7 @@ from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE from .weaviate import weaviate_destination_entry -add_destination_entry(destination_type=ASTRA_CONNECTOR_TYPE, entry=astra_destination_entry) +add_destination_entry(destination_type=ASTRADB_CONNECTOR_TYPE, entry=astradb_destination_entry) add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry) diff --git a/unstructured/ingest/v2/processes/connectors/astra.py b/unstructured/ingest/v2/processes/connectors/astradb.py similarity index 80% rename from unstructured/ingest/v2/processes/connectors/astra.py rename to unstructured/ingest/v2/processes/connectors/astradb.py index dbd6204725..dc10862e8b 100644 --- a/unstructured/ingest/v2/processes/connectors/astra.py +++ b/unstructured/ingest/v2/processes/connectors/astradb.py @@ -26,30 +26,30 @@ if TYPE_CHECKING: from astrapy.db import AstraDBCollection -CONNECTOR_TYPE = "astra" +CONNECTOR_TYPE = "astradb" @dataclass -class AstraAccessConfig(AccessConfig): +class AstraDBAccessConfig(AccessConfig): token: str api_endpoint: str @dataclass -class AstraConnectionConfig(ConnectionConfig): +class AstraDBConnectionConfig(ConnectionConfig): connection_type: str = CONNECTOR_TYPE - access_config: AstraAccessConfig = enhanced_field(sensitive=True) + access_config: AstraDBAccessConfig = enhanced_field(sensitive=True) @dataclass -class AstraUploadStagerConfig(UploadStagerConfig): +class AstraDBUploadStagerConfig(UploadStagerConfig): pass @dataclass -class AstraUploadStager(UploadStager): - upload_stager_config: AstraUploadStagerConfig = field( - default_factory=lambda: AstraUploadStagerConfig() +class AstraDBUploadStager(UploadStager): + upload_stager_config: AstraDBUploadStagerConfig = field( + default_factory=lambda: AstraDBUploadStagerConfig() ) def conform_dict(self, element_dict: dict) -> dict: @@ -79,7 +79,7 @@ def run( @dataclass -class AstraUploaderConfig(UploaderConfig): +class AstraDBUploaderConfig(UploaderConfig): collection_name: str embedding_dimension: int namespace: Optional[str] = None @@ -88,12 +88,12 @@ class AstraUploaderConfig(UploaderConfig): @dataclass -class AstraUploader(Uploader): - connection_config: AstraConnectionConfig - upload_config: AstraUploaderConfig +class AstraDBUploader(Uploader): + connection_config: AstraDBConnectionConfig + upload_config: AstraDBUploaderConfig connector_type: str = CONNECTOR_TYPE - @requires_dependencies(["astrapy"], extras="astra") + @requires_dependencies(["astrapy"], extras="astradb") def get_collection(self) -> "AstraDBCollection": from astrapy.db import AstraDB @@ -102,11 +102,11 @@ def get_collection(self) -> "AstraDBCollection": embedding_dimension = self.upload_config.embedding_dimension requested_indexing_policy = self.upload_config.requested_indexing_policy - # If the user has requested an indexing policy, pass it to the AstraDB + # If the user has requested an indexing policy, pass it to the Astra DB options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None # Build the Astra DB object. - # caller_name/version for AstraDB tracking + # caller_name/version for Astra DB tracking astra_db = AstraDB( api_endpoint=self.connection_config.access_config.api_endpoint, token=self.connection_config.access_config.token, @@ -142,10 +142,10 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None: collection.insert_many(chunk) -astra_destination_entry = DestinationRegistryEntry( - connection_config=AstraConnectionConfig, - upload_stager_config=AstraUploadStagerConfig, - upload_stager=AstraUploadStager, - uploader_config=AstraUploaderConfig, - uploader=AstraUploader, +astradb_destination_entry = DestinationRegistryEntry( + connection_config=AstraDBConnectionConfig, + upload_stager_config=AstraDBUploadStagerConfig, + upload_stager=AstraDBUploadStager, + uploader_config=AstraDBUploaderConfig, + uploader=AstraDBUploader, )