Skip to content

Commit

Permalink
rename complete
Browse files Browse the repository at this point in the history
  • Loading branch information
potter-potter committed Aug 1, 2024
1 parent 8fd216c commit 7bd61fc
Show file tree
Hide file tree
Showing 22 changed files with 109 additions and 109 deletions.
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ include requirements/huggingface.in

# Ingest extras
include requirements/ingest/airtable.in
include requirements/ingest/astra.in
include requirements/ingest/astradb.in
include requirements/ingest/azure-cognitive-search.in
include requirements/ingest/azure.in
include requirements/ingest/biomed.in
Expand Down
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -253,9 +253,9 @@ install-ingest-mongodb:
install-ingest-databricks-volumes:
python3 -m pip install -r requirements/ingest/databricks-volumes.txt

.PHONY: install-ingest-astra
install-ingest-astra:
python3 -m pip install -r requirements/ingest/astra.txt
.PHONY: install-ingest-astradb
install-ingest-astradb:
python3 -m pip install -r requirements/ingest/astradb.txt

.PHONY: install-ingest-clarifai
install-ingest-clarifai:
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
# This file is autogenerated by pip-compile with Python 3.9
# by the following command:
#
# pip-compile ./ingest/astra.in
# pip-compile ./ingest/astradb.in
#
anyio==3.7.1
# via
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# httpx
astrapy==1.4.0
# via -r ./ingest/astra.in
# via -r ./ingest/astradb.in
bson==0.5.10
# via astrapy
cassandra-driver==3.29.1
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
"xlsx": xlsx_reqs,
# Extra requirements for data connectors
"airtable": load_requirements("requirements/ingest/airtable.in"),
"astra": load_requirements("requirements/ingest/astra.in"),
"astradb": load_requirements("requirements/ingest/astradb.in"),
"azure": load_requirements("requirements/ingest/azure.in"),
"azure-cognitive-search": load_requirements(
"requirements/ingest/azure-cognitive-search.in",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ set -e
SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=astra-dest
OUTPUT_FOLDER_NAME=astradb-dest
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
Expand All @@ -21,7 +21,7 @@ if [ -z "$ASTRA_DB_API_ENDPOINT" ]; then
fi

RANDOM_SUFFIX=$((RANDOM % 100000 + 1))
COLLECTION_NAME="astra_test_output_$RANDOM_SUFFIX"
COLLECTION_NAME="astradb_test_output_$RANDOM_SUFFIX"
EMBEDDING_DIMENSION=384

# shellcheck disable=SC1091
Expand All @@ -31,7 +31,7 @@ function cleanup() {
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR"

python "$SCRIPT_DIR"/python/test-ingest-astra-output.py \
python "$SCRIPT_DIR"/python/test-ingest-astradb-output.py \
--token "$ASTRA_DB_APPLICATION_TOKEN" \
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \
--collection-name "$COLLECTION_NAME" down
Expand All @@ -51,14 +51,14 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--chunk-max-characters 1500 \
--chunk-multipage-sections \
--embedding-provider "langchain-huggingface" \
astra \
astradb \
--token "$ASTRA_DB_APPLICATION_TOKEN" \
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \
--collection-name "$COLLECTION_NAME" \
--embedding-dimension "$EMBEDDING_DIMENSION" \
--requested-indexing-policy '{"deny": ["metadata"]}'

python "$SCRIPT_DIR"/python/test-ingest-astra-output.py \
python "$SCRIPT_DIR"/python/test-ingest-astradb-output.py \
--token "$ASTRA_DB_APPLICATION_TOKEN" \
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \
--collection-name "$COLLECTION_NAME" check
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def get_client(token, api_endpoint, collection_name) -> AstraDB:
return astra_db, astra_db_collection


@click.group(name="astra-ingest")
@click.group(name="astradb-ingest")
@click.option("--token", type=str)
@click.option("--api-endpoint", type=str)
@click.option("--collection-name", type=str, default="collection_test")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ set -e
SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=astra
OUTPUT_FOLDER_NAME=astradb
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
Expand All @@ -23,7 +23,7 @@ fi
COLLECTION_NAME="ingest_test_src"

PYTHONPATH=. ./unstructured/ingest/main.py \
astra \
astradb \
--token "$ASTRA_DB_APPLICATION_TOKEN" \
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \
--collection-name "$COLLECTION_NAME" \
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured_ingest/test-ingest-dest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ cd "$SCRIPT_DIR"/.. || exit 1
export OMP_THREAD_LIMIT=1

all_tests=(
'astra.sh'
'astradb.sh'
'azure.sh'
'azure-cognitive-search.sh'
'box.sh'
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured_ingest/test-ingest-src.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ export OMP_THREAD_LIMIT=1
all_tests=(
's3.sh'
's3-minio.sh'
'astra.sh'
'astradb.sh'
'azure.sh'
'biomed-api.sh'
'biomed-path.sh'
Expand Down
8 changes: 4 additions & 4 deletions unstructured/ingest/cli/cmds/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from unstructured.ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd

from .airtable import get_base_src_cmd as airtable_base_src_cmd
from .astra import get_base_dest_cmd as astra_base_dest_cmd
from .astra import get_base_src_cmd as astra_base_src_cmd
from .astradb import get_base_dest_cmd as astradb_base_dest_cmd
from .astradb import get_base_src_cmd as astradb_base_src_cmd
from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd
from .biomed import get_base_src_cmd as biomed_base_src_cmd
from .chroma import get_base_dest_cmd as chroma_base_dest_cmd
Expand Down Expand Up @@ -63,7 +63,7 @@

base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [
airtable_base_src_cmd,
astra_base_src_cmd,
astradb_base_src_cmd,
azure_base_src_cmd,
biomed_base_src_cmd,
box_base_src_cmd,
Expand Down Expand Up @@ -106,7 +106,7 @@
)

base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
astra_base_dest_cmd,
astradb_base_dest_cmd,
azure_base_dest_cmd,
box_base_dest_cmd,
chroma_base_dest_cmd,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import click

from unstructured.ingest.cli.interfaces import CliConfig, Dict
from unstructured.ingest.connector.astra import AstraWriteConfig, SimpleAstraConfig
from unstructured.ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig


@dataclass
class AstraCliConfig(SimpleAstraConfig, CliConfig):
class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig):
@staticmethod
def get_cli_options() -> t.List[click.Option]:
options = [
Expand Down Expand Up @@ -48,7 +48,7 @@ def get_cli_options() -> t.List[click.Option]:


@dataclass
class AstraCliWriteConfig(AstraWriteConfig, CliConfig):
class AstraDBCliWriteConfig(AstraDBWriteConfig, CliConfig):
@staticmethod
def get_cli_options() -> t.List[click.Option]:
options = [
Expand Down Expand Up @@ -81,8 +81,8 @@ def get_base_src_cmd():
from unstructured.ingest.cli.base.src import BaseSrcCmd

cmd_cls = BaseSrcCmd(
cmd_name="astra",
cli_config=AstraCliConfig,
cmd_name="astradb",
cli_config=AstraDBCliConfig,
)
return cmd_cls

Expand All @@ -91,9 +91,9 @@ def get_base_dest_cmd():
from unstructured.ingest.cli.base.dest import BaseDestCmd

cmd_cls = BaseDestCmd(
cmd_name="astra",
cli_config=AstraCliConfig,
additional_cli_options=[AstraCliWriteConfig],
write_config=AstraWriteConfig,
cmd_name="astradb",
cli_config=AstraDBCliConfig,
additional_cli_options=[AstraDBCliWriteConfig],
write_config=AstraDBWriteConfig,
)
return cmd_cls
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,23 @@


@dataclass
class AstraAccessConfig(AccessConfig):
class AstraDBAccessConfig(AccessConfig):
token: str = enhanced_field(sensitive=True)
api_endpoint: str = enhanced_field(sensitive=True)


@dataclass
class SimpleAstraConfig(BaseConnectorConfig):
access_config: AstraAccessConfig
class SimpleAstraDBConfig(BaseConnectorConfig):
access_config: AstraDBAccessConfig
collection_name: str
namespace: t.Optional[str] = None


@dataclass
class AstraIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
connector_config: SimpleAstraConfig
class AstraDBIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc):
connector_config: SimpleAstraDBConfig
metadata: t.Dict[str, str] = field(default_factory=dict)
registry_name: str = "astra"
registry_name: str = "astradb"

@property
def filename(self):
Expand Down Expand Up @@ -76,7 +76,7 @@ def update_source_metadata(self, **kwargs):
)

@SourceConnectionError.wrap
@requires_dependencies(["astrapy"], extras="astra")
@requires_dependencies(["astrapy"], extras="astradb")
@BaseSingleIngestDoc.skip_if_file_exists
def get_file(self):
self.filename.parent.mkdir(parents=True, exist_ok=True)
Expand All @@ -90,19 +90,19 @@ def get_file(self):


@dataclass
class AstraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
connector_config: SimpleAstraConfig
class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
connector_config: SimpleAstraDBConfig
_astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
_astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)

@property
@requires_dependencies(["astrapy"], extras="astra")
@requires_dependencies(["astrapy"], extras="astradb")
def astra_db_collection(self) -> "AstraDBCollection":
if self._astra_db_collection is None:
from astrapy.db import AstraDB

# Build the Astra DB object.
# caller_name/version for AstraDB tracking
# caller_name/version for Astra DB tracking
self._astra_db = AstraDB(
api_endpoint=self.connector_config.access_config.api_endpoint,
token=self.connector_config.access_config.token,
Expand All @@ -117,27 +117,27 @@ def astra_db_collection(self) -> "AstraDBCollection":
)
return self._astra_db_collection # type: ignore

@requires_dependencies(["astrapy"], extras="astra")
@requires_dependencies(["astrapy"], extras="astradb")
@SourceConnectionError.wrap # type: ignore
def initialize(self):
_ = self.astra_db_collection

@requires_dependencies(["astrapy"], extras="astra")
@requires_dependencies(["astrapy"], extras="astradb")
def check_connection(self):
try:
_ = self.astra_db_collection
except Exception as e:
logger.error(f"Failed to validate connection {e}", exc_info=True)
raise SourceConnectionError(f"failed to validate connection: {e}")

@requires_dependencies(["astrapy"], extras="astra")
@requires_dependencies(["astrapy"], extras="astradb")
def get_ingest_docs(self): # type: ignore
# Perform the find operation
astra_docs = list(self.astra_db_collection.paginated_find())

doc_list = []
for record in astra_docs:
doc = AstraIngestDoc(
doc = AstraDBIngestDoc(
connector_config=self.connector_config,
processor_config=self.processor_config,
read_config=self.read_config,
Expand All @@ -152,16 +152,16 @@ def get_ingest_docs(self): # type: ignore


@dataclass
class AstraWriteConfig(WriteConfig):
class AstraDBWriteConfig(WriteConfig):
embedding_dimension: int
requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None
batch_size: int = 20


@dataclass
class AstraDestinationConnector(BaseDestinationConnector):
write_config: AstraWriteConfig
connector_config: SimpleAstraConfig
class AstraDBDestinationConnector(BaseDestinationConnector):
write_config: AstraDBWriteConfig
connector_config: SimpleAstraDBConfig
_astra_db: t.Optional["AstraDB"] = field(init=False, default=None)
_astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None)

Expand All @@ -180,19 +180,19 @@ def to_dict(self, **kwargs):
return _asdict(self_cp, **kwargs)

@property
@requires_dependencies(["astrapy"], extras="astra")
@requires_dependencies(["astrapy"], extras="astradb")
def astra_db_collection(self) -> "AstraDBCollection":
if self._astra_db_collection is None:
from astrapy.db import AstraDB

collection_name = self.connector_config.collection_name
embedding_dimension = self.write_config.embedding_dimension

# If the user has requested an indexing policy, pass it to the AstraDB
# If the user has requested an indexing policy, pass it to the Astra DB
requested_indexing_policy = self.write_config.requested_indexing_policy
options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None

# caller_name/version for AstraDB tracking
# caller_name/version for Astra DB tracking
self._astra_db = AstraDB(
api_endpoint=self.connector_config.access_config.api_endpoint,
token=self.connector_config.access_config.token,
Expand All @@ -209,12 +209,12 @@ def astra_db_collection(self) -> "AstraDBCollection":
)
return self._astra_db_collection

@requires_dependencies(["astrapy"], extras="astra")
@requires_dependencies(["astrapy"], extras="astradb")
@DestinationConnectionError.wrap
def initialize(self):
_ = self.astra_db_collection

@requires_dependencies(["astrapy"], extras="astra")
@requires_dependencies(["astrapy"], extras="astradb")
def check_connection(self):
try:
_ = self.astra_db_collection
Expand All @@ -223,7 +223,7 @@ def check_connection(self):
raise DestinationConnectionError(f"failed to validate connection: {e}")

def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra.")
logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra DB.")

astra_batch_size = self.write_config.batch_size

Expand Down
4 changes: 2 additions & 2 deletions unstructured/ingest/connector/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Dict, Type, cast

from unstructured.ingest.connector.airtable import AirtableIngestDoc
from unstructured.ingest.connector.astra import AstraIngestDoc
from unstructured.ingest.connector.astradb import AstraDBIngestDoc
from unstructured.ingest.connector.biomed import BiomedIngestDoc
from unstructured.ingest.connector.confluence import ConfluenceIngestDoc
from unstructured.ingest.connector.delta_table import DeltaTableIngestDoc
Expand Down Expand Up @@ -46,7 +46,7 @@

INGEST_DOC_NAME_TO_CLASS: Dict[str, Type[EnhancedDataClassJsonMixin]] = {
"airtable": AirtableIngestDoc,
"astra": AstraIngestDoc,
"astradb": AstraDBIngestDoc,
"azure": AzureBlobStorageIngestDoc,
"biomed": BiomedIngestDoc,
"box": BoxIngestDoc,
Expand Down
Loading

0 comments on commit 7bd61fc

Please sign in to comment.