diff --git a/Dockerfile b/Dockerfile
index fc805e27..9578f4ce 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,7 +3,7 @@ USER root
 COPY . .
 RUN chmod +x init.sh
 USER airflow
-RUN pip install -r requirements.txt
+RUN pip install --no-cache-dir apache-airflow==2.9.1 -r requirements.txt
 
 FROM python:3.11-bullseye AS test
 WORKDIR /project
diff --git a/dags/hivemind_etl_helpers/gdrive_ingestion_etl.py b/dags/hivemind_etl_helpers/gdrive_ingestion_etl.py
deleted file mode 100644
index 4395804f..00000000
--- a/dags/hivemind_etl_helpers/gdrive_ingestion_etl.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import logging
-
-from hivemind_etl_helpers.src.utils.credentials import load_redis_credentials
-from llama_index.core.ingestion import (
-    DocstoreStrategy,
-    IngestionCache,
-    IngestionPipeline,
-)
-from llama_index.core.node_parser import SemanticSplitterNodeParser
-from llama_index.storage.docstore.postgres import PostgresDocumentStore
-from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
-from llama_index.vector_stores.postgres import PGVectorStore
-from tc_hivemind_backend.db.credentials import load_postgres_credentials
-from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams
-from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
-
-
-class GoogleDriveIngestionPipeline:
-    def __init__(self, community_id: str):
-        self.community_id = community_id
-
-        # Embedding models
-        self.cohere_model = CohereEmbedding()
-        _, self.embedding_dim = load_model_hyperparams()
-        self.pg_creds = load_postgres_credentials()
-        self.redis_cred = load_redis_credentials()
-        self.table_name = "gdrive"
-        self.dbname = f"community_{community_id}"
-
-        # Database details
-        self.redis_host = self.redis_cred["host"]
-        self.redis_port = self.redis_cred["port"]
-
-    def run_pipeline(self, docs):
-        pipeline = IngestionPipeline(
-            transformations=[
-                SemanticSplitterNodeParser(embed_model=self.cohere_model),
-                self.cohere_model,
-            ],
-            docstore=PostgresDocumentStore.from_params(
-                host=self.pg_creds["host"],
-                port=self.pg_creds["port"],
-                database=self.dbname,
-                user=self.pg_creds["user"],
-                password=self.pg_creds["password"],
-                table_name=self.table_name + "_docstore",
-            ),
-            vector_store=PGVectorStore.from_params(
-                host=self.pg_creds["host"],
-                port=self.pg_creds["port"],
-                database=self.dbname,
-                user=self.pg_creds["user"],
-                password=self.pg_creds["password"],
-                table_name=self.table_name,
-                embed_dim=self.embedding_dim,
-            ),
-            cache=IngestionCache(
-                cache=RedisCache.from_host_and_port(self.redis_host, self.redis_port),
-                collection=self.dbname + f"_{self.table_name}" + "_ingestion_cache",
-                docstore_strategy=DocstoreStrategy.UPSERTS,
-            ),
-            docstore_strategy=DocstoreStrategy.UPSERTS,
-        )
-        try:
-            nodes = pipeline.run(documents=docs, show_progress=True)
-            return nodes
-        except Exception as e:
-            logging.error(
-                f"An error occurred while running the pipeline: {e}", exc_info=True
-            )
diff --git a/dags/hivemind_etl_helpers/github_etl.py b/dags/hivemind_etl_helpers/github_etl.py
index fb06b5d0..648e462a 100644
--- a/dags/hivemind_etl_helpers/github_etl.py
+++ b/dags/hivemind_etl_helpers/github_etl.py
@@ -2,6 +2,7 @@
 from datetime import datetime
 
 from dotenv import load_dotenv
+from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline
 from hivemind_etl_helpers.src.db.github.extract import (
     GithubExtraction,
     fetch_issues,
@@ -10,13 +11,8 @@
 from hivemind_etl_helpers.src.db.github.github_organization_repos import (
     get_github_organization_repos,
 )
-from hivemind_etl_helpers.src.db.github.load import (
-    PrepareDeletion,
-    load_documents_into_pg_database,
-)
 from hivemind_etl_helpers.src.db.github.transform import GitHubTransformation
 from llama_index.core import Document
-from tc_hivemind_backend.db.pg_db_utils import setup_db
 
 
 def process_github_vectorstore(
@@ -40,30 +36,8 @@
         the date to start processing data from
     """
     load_dotenv()
-    dbname = f"community_{community_id}"
     prefix = f"COMMUNITYID: {community_id} "
-    logging.info(prefix)
-
-    table_name = "github"
-
-    logging.info(f"{prefix}Setting up database")
-    latest_date_query = f"""
-        SELECT (metadata_->> 'created_at')::timestamp
-        AS latest_date
-        FROM data_{table_name}
-        ORDER BY (metadata_->>'created_at')::timestamp DESC
-        LIMIT 1;
-    """
-    from_date_saved_data = setup_db(
-        community_id=community_id, dbname=dbname, latest_date_query=latest_date_query
-    )
-    from_date: datetime | None
-    if from_date_saved_data:
-        from_date = from_date_saved_data
-    else:
-        from_date = from_starting_date
-
-    logging.info(f"Fetching data from date: {from_date}")
+    logging.info(f"{prefix}Processing data!")
 
     org_repository_ids = get_github_organization_repos(
         github_organization_ids=github_org_ids
@@ -73,18 +47,10 @@
 
     # EXTRACT
     github_extractor = GithubExtraction()
-    github_comments = github_extractor.fetch_comments(
-        repository_id=repository_ids, from_date=from_date
-    )
-    github_commits = github_extractor.fetch_commits(
-        repository_id=repository_ids, from_date=from_date
-    )
-    github_issues = fetch_issues(repository_id=repository_ids, from_date=from_date)
-    github_prs = fetch_pull_requests(
-        repository_id=repository_ids,
-        from_date_created=from_starting_date,
-        from_date_updated=from_date_saved_data,
-    )
+    github_comments = github_extractor.fetch_comments(repository_id=repository_ids)
+    github_commits = github_extractor.fetch_commits(repository_id=repository_ids)
+    github_issues = fetch_issues(repository_id=repository_ids)
+    github_prs = fetch_pull_requests(repository_id=repository_ids)
 
     # TRANSFORM
     # llama-index documents
@@ -100,29 +66,13 @@
     logging.debug(f"{prefix}Transforming pull requests!")
     docs_prs = github_transformation.transform_pull_requests(github_prs)
 
-    # there's no update on commits
-    all_documents: list[Document] = docs_commit.copy()
-
-    # checking for updates on prs, issues, and comments
-    delete_docs = PrepareDeletion(community_id)
-    docs_to_save, deletion_query = delete_docs.prepare(
-        pr_documents=docs_prs,
-        issue_documents=docs_issue,
-        comment_documents=docs_comment + docs_issue_comments,
+    all_documents: list[Document] = (
+        docs_commit + docs_comment + docs_issue_comments + docs_prs + docs_issue
     )
-    all_documents.extend(docs_to_save)
 
     logging.debug(f"{len(all_documents)} prepared to be saved!")
-    if len(all_documents) == 0:
-        logging.info("No new documents to save!")
-
-    logging.info(f"deletion_query: {deletion_query}")
 
     # LOAD
     logging.info(f"{prefix}Loading data into postgres db")
-    load_documents_into_pg_database(
-        documents=all_documents,
-        community_id=community_id,
-        table_name=table_name,
-        deletion_query=deletion_query,
-    )
+    ingestion_pipeline = CustomIngestionPipeline(community_id, collection_name="github")
+    ingestion_pipeline.run_pipeline(docs=all_documents)
diff --git a/dags/hivemind_etl_helpers/ingestion_pipeline.py b/dags/hivemind_etl_helpers/ingestion_pipeline.py
index 0c76c06b..a49ffbcd 100644
--- a/dags/hivemind_etl_helpers/ingestion_pipeline.py
+++ b/dags/hivemind_etl_helpers/ingestion_pipeline.py
@@ -1,63 +1,70 @@
-from dags.hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db
-from dags.hivemind_etl_helpers.src.utils.redis import RedisSingleton
-from llama_index.core import MockEmbedding
+import logging
+
+from hivemind_etl_helpers.src.utils.credentials import load_redis_credentials
+from hivemind_etl_helpers.src.utils.mongo import get_mongo_uri
+from hivemind_etl_helpers.src.utils.redis import RedisSingleton
+from llama_index.core import Document, MockEmbedding
 from llama_index.core.ingestion import (
     DocstoreStrategy,
     IngestionCache,
     IngestionPipeline,
 )
 from llama_index.core.node_parser import SemanticSplitterNodeParser
-from llama_index.storage.docstore.postgres import PostgresDocumentStore
+from llama_index.storage.docstore.mongodb import MongoDocumentStore
 from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
-from llama_index.vector_stores.postgres import PGVectorStore
 from tc_hivemind_backend.db.credentials import load_postgres_credentials
+from tc_hivemind_backend.db.qdrant import QdrantSingleton
 from tc_hivemind_backend.db.utils.model_hyperparams import load_model_hyperparams
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
+from tc_hivemind_backend.qdrant_vector_access import QDrantVectorAccess
 
 
 class CustomIngestionPipeline:
-    def __init__(self, community_id: str, table_name: str, testing: bool = False):
-        self.postgres_credentials = load_postgres_credentials()
-        self.table_name = table_name
-        self.dbname = f"community_{community_id}"
+    def __init__(self, community_id: str, collection_name: str, testing: bool = False):
         self.community_id = community_id
+        self.qdrant_client = QdrantSingleton.get_instance().client
+
+        _, self.embedding_dim = load_model_hyperparams()
+        self.pg_creds = load_postgres_credentials()
+        self.redis_cred = load_redis_credentials()
+        self.collection_name = community_id
+        self.platform_name = collection_name
+
         self.embed_model = (
-            CohereEmbedding() if not testing else MockEmbedding(embed_dim=1024)
+            CohereEmbedding()
+            if not testing
+            else MockEmbedding(embed_dim=self.embedding_dim)
         )
         self.redis_client = RedisSingleton.get_instance().get_client()
 
-    def run_pipeline(self, docs):
-        _, embedding_dim = load_model_hyperparams()
-        setup_db(community_id=self.community_id)
+    def run_pipeline(self, docs: list[Document]):
+        # qdrant is just collection based and doesn't have any database
+        qdrant_collection_name = f"{self.collection_name}_{self.platform_name}"
+        vector_access = QDrantVectorAccess(collection_name=qdrant_collection_name)
+        vector_store = vector_access.setup_qdrant_vector_store()
+
         pipeline = IngestionPipeline(
             transformations=[
                 SemanticSplitterNodeParser(embed_model=self.embed_model),
                 self.embed_model,
             ],
-            docstore=PostgresDocumentStore.from_params(
-                host=self.postgres_credentials["host"],
-                port=self.postgres_credentials["port"],
-                user=self.postgres_credentials["user"],
-                password=self.postgres_credentials["password"],
-                database=self.dbname,
-                table_name=self.table_name + "_docstore",
-            ),
-            vector_store=PGVectorStore.from_params(
-                host=self.postgres_credentials["host"],
-                port=self.postgres_credentials["port"],
-                user=self.postgres_credentials["user"],
-                password=self.postgres_credentials["password"],
-                database=self.dbname,
-                table_name=self.table_name,
-                embed_dim=embedding_dim,
+            docstore=MongoDocumentStore.from_uri(
+                uri=get_mongo_uri(),
+                db_name=f"docstore_{self.collection_name}",
+                namespace=self.platform_name,
             ),
+            vector_store=vector_store,
             cache=IngestionCache(
                 cache=RedisCache.from_redis_client(self.redis_client),
-                collection=self.dbname + f"_{self.table_name}" + "_ingestion_cache",
+                collection=f"{self.collection_name}_{self.platform_name}_ingestion_cache",
+                docstore_strategy=DocstoreStrategy.UPSERTS,
             ),
             docstore_strategy=DocstoreStrategy.UPSERTS,
         )
-
-        nodes = pipeline.run(documents=docs, show_progress=True)
-
-        return nodes
+        try:
+            nodes = pipeline.run(documents=docs, show_progress=True)
+            return nodes
+        except Exception as e:
+            logging.error(
+                f"An error occurred while running the pipeline: {e}", exc_info=True
+            )
diff --git a/dags/hivemind_etl_helpers/notion_etl.py b/dags/hivemind_etl_helpers/notion_etl.py
index f8260e98..00c88ef1 100644
--- a/dags/hivemind_etl_helpers/notion_etl.py
+++ b/dags/hivemind_etl_helpers/notion_etl.py
@@ -1,7 +1,7 @@
 import logging
 
-from dags.hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline
-from dags.hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor
+from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline
+from hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor
 
 
 def process_notion_etl(
@@ -41,8 +41,10 @@ def process_notion_etl(
     except TypeError as exp:
         logging.info(f"No documents retrieved from notion! exp: {exp}")
 
-    table_name = "notion"
-    ingestion_pipeline = CustomIngestionPipeline(community_id, table_name=table_name)
+    collection_name = "notion"
+    ingestion_pipeline = CustomIngestionPipeline(
+        community_id, collection_name=collection_name
+    )
     try:
         ingestion_pipeline.run_pipeline(docs=documents)
     except Exception as e:
diff --git a/dags/hivemind_etl_helpers/src/utils/credentials.py b/dags/hivemind_etl_helpers/src/utils/credentials.py
index bd01b6e6..d4dac733 100644
--- a/dags/hivemind_etl_helpers/src/utils/credentials.py
+++ b/dags/hivemind_etl_helpers/src/utils/credentials.py
@@ -44,10 +44,20 @@
     """
     load_dotenv()
 
-    redis_creds: dict[str, str] = {}
-
-    redis_creds["port"] = os.getenv("REDIS_PORT", "")
-    redis_creds["password"] = os.getenv("REDIS_PASSWORD", "")
-    redis_creds["host"] = os.getenv("REDIS_HOST", "")
-
+    host = os.getenv("REDIS_HOST")
+    port = os.getenv("REDIS_PORT")
+    password = os.getenv("REDIS_PASSWORD")
+
+    if host is None:
+        raise ValueError("`REDIS_HOST` is not set in env credentials!")
+    if port is None:
+        raise ValueError("`REDIS_PORT` is not set in env credentials!")
+    if password is None:
+        raise ValueError("`REDIS_PASSWORD` is not set in env credentials!")
+
+    redis_creds: dict[str, str] = {
+        "host": host,
+        "port": port,
+        "password": password,
+    }
     return redis_creds
diff --git a/dags/hivemind_etl_helpers/src/utils/mongo.py b/dags/hivemind_etl_helpers/src/utils/mongo.py
index 4259cdaf..46e0e30f 100644
--- a/dags/hivemind_etl_helpers/src/utils/mongo.py
+++ b/dags/hivemind_etl_helpers/src/utils/mongo.py
@@ -1,5 +1,4 @@
 import logging
-from typing import Any
 
 from pymongo import MongoClient
 
@@ -13,8 +12,7 @@ def __init__(self):
         if MongoSingleton.__instance is not None:
             raise Exception("This class is a singleton!")
         else:
-            creds = load_mongo_credentials()
-            connection_uri = config_mogno_creds(creds)
+            connection_uri = get_mongo_uri()
             self.client = MongoClient(connection_uri)
             MongoSingleton.__instance = self
 
@@ -34,7 +32,8 @@ def get_client(self):
         return self.client
 
 
-def config_mogno_creds(mongo_creds: dict[str, Any]):
+def get_mongo_uri() -> str:
+    mongo_creds = load_mongo_credentials()
     user = mongo_creds["user"]
     password = mongo_creds["password"]
     host = mongo_creds["host"]
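# --- Illustrative note (not part of the patch) ---
# A minimal sketch of how the refactored CustomIngestionPipeline is driven,
# mirroring the call sites changed in this diff (github_etl.py, notion_etl.py,
# hivemind_google_drive_etl.py). The community id and Document payload below
# are placeholder values, not data from this repository.
from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline
from llama_index.core import Document

docs = [Document(text="example payload")]

# collection_name selects the platform; internally the Qdrant collection is
# named f"{community_id}_{collection_name}" and the Mongo docstore database
# is f"docstore_{community_id}" with the platform as its namespace.
pipeline = CustomIngestionPipeline(community_id="1234", collection_name="github")
nodes = pipeline.run_pipeline(docs=docs)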
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py
index dfe3d8f8..f68ef1f2 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_github_etl_fetch_commits.py
@@ -1,8 +1,8 @@
 from datetime import datetime
 from unittest import TestCase
 
-from dags.hivemind_etl_helpers.src.db.github.extract import GithubExtraction
 from github.neo4j_storage.neo4j_connection import Neo4jConnection
+from hivemind_etl_helpers.src.db.github.extract import GithubExtraction
 
 
 class TestFetchCommits(TestCase):
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py b/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py
index 2afaa72a..5d7f2c6b 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_ingestion_pipeline_etl.py
@@ -1,33 +1,18 @@
 import unittest
 from unittest.mock import Mock
 
-from dags.hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline
-from hivemind_etl_helpers.src.db.gdrive.db_utils import setup_db
+from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline
 from llama_index.core.ingestion import IngestionPipeline
 from llama_index.core.schema import Document
-from tc_hivemind_backend.db.credentials import load_postgres_credentials
 
 
 class TestIngestionPipeline(unittest.TestCase):
-    def setUpDB(self, community: str):
-        self.db_name = f"community_{community}"
-        self.creds = load_postgres_credentials()
-        setup_db(community)
-        self.db_config = {
-            "database": self.db_name,
-            "user": self.creds["user"],
-            "password": self.creds["password"],
-            "host": self.creds["host"],
-            "port": self.creds["port"],
-        }
-
     def test_run_pipeline(self):
         ingest_pipeline = Mock(IngestionPipeline)
         community = "1234"
-        table_name = "gdrive"
-        self.setUpDB(community)
-        gdrive_pipeline = CustomIngestionPipeline(
-            "1234", table_name=table_name, testing=True
+        collection_name = "gdrive"
+        ingestion_pipeline = CustomIngestionPipeline(
+            community_id=community, collection_name=collection_name, testing=True
         )
         docs = [
             Document(
@@ -40,18 +25,18 @@ def test_run_pipeline(self):
             ),
         ]
 
-        processed_result = gdrive_pipeline.run_pipeline(docs)
+        processed_result = ingestion_pipeline.run_pipeline(docs)
         ingest_pipeline.run.return_value = processed_result
         self.assertEqual(len(processed_result), 2)
 
     def test_load_pipeline_run_exception(self):
-        gdrive_pipeline = CustomIngestionPipeline(
-            "1234", table_name="gdrive", testing=True
+        ingestion_pipeline = CustomIngestionPipeline(
+            "1234", collection_name="gdrive", testing=True
         )
-        gdrive_pipeline.run_pipeline = Mock()
-        gdrive_pipeline.run_pipeline.side_effect = Exception("Test Exception")
+        ingestion_pipeline.run_pipeline = Mock()
+        ingestion_pipeline.run_pipeline.side_effect = Exception("Test Exception")
         docs = ["ww"]
         with self.assertRaises(Exception) as context:
-            gdrive_pipeline.run_pipeline(docs)
+            ingestion_pipeline.run_pipeline(docs)
         self.assertEqual(str(context.exception), "Test Exception")
-        gdrive_pipeline.run_pipeline.assert_called_with(docs)
+        ingestion_pipeline.run_pipeline.assert_called_with(docs)
diff --git a/dags/hivemind_etl_helpers/tests/integration/test_load_envs.py b/dags/hivemind_etl_helpers/tests/integration/test_load_envs.py
index ae182c0c..daec8b08 100644
--- a/dags/hivemind_etl_helpers/tests/integration/test_load_envs.py
+++ b/dags/hivemind_etl_helpers/tests/integration/test_load_envs.py
@@ -4,6 +4,7 @@
     load_mongo_credentials,
     load_redis_credentials,
 )
+from hivemind_etl_helpers.src.utils.mongo import get_mongo_uri
 
 
 class TestCredentialLoadings(unittest.TestCase):
@@ -33,10 +34,17 @@ def test_redis_envs_check_type(self):
     def test_redis_envs_values(self):
         redis_creds = load_redis_credentials()
 
-        self.assertNotEqual(redis_creds["password"], "")
-        self.assertNotEqual(redis_creds["host"], "")
-        self.assertNotEqual(redis_creds["port"], "")
+        self.assertIsNotNone(redis_creds["password"])
+        self.assertIsNotNone(redis_creds["host"])
+        self.assertIsNotNone(redis_creds["port"])
 
         self.assertIsInstance(redis_creds["password"], str)
         self.assertIsInstance(redis_creds["host"], str)
         self.assertIsInstance(redis_creds["port"], str)
+
+    def test_config_mongo_creds(self):
+        mongo_uri = get_mongo_uri()
+
+        self.assertIsInstance(mongo_uri, str)
+        self.assertIn("mongodb://", mongo_uri)
+        self.assertNotIn("None", mongo_uri)
diff --git a/dags/hivemind_etl_helpers/tests/unit/test_notion_extractor.py b/dags/hivemind_etl_helpers/tests/unit/test_notion_extractor.py
index 9f89e8f3..16f9fd22 100644
--- a/dags/hivemind_etl_helpers/tests/unit/test_notion_extractor.py
+++ b/dags/hivemind_etl_helpers/tests/unit/test_notion_extractor.py
@@ -1,7 +1,7 @@
 import unittest
 from unittest.mock import Mock
 
-from dags.hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor
+from hivemind_etl_helpers.src.db.notion.extractor import NotionExtractor
 
 
 class TestNotionExtractorLive(unittest.TestCase):
diff --git a/dags/hivemind_google_drive_etl.py b/dags/hivemind_google_drive_etl.py
index f5e39518..acf358dc 100644
--- a/dags/hivemind_google_drive_etl.py
+++ b/dags/hivemind_google_drive_etl.py
@@ -5,7 +5,7 @@
 
 from airflow import DAG
 from airflow.decorators import task
-from hivemind_etl_helpers.gdrive_ingestion_etl import GoogleDriveIngestionPipeline
+from hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline
 from hivemind_etl_helpers.src.db.gdrive.gdrive_loader import GoogleDriveLoader
 from hivemind_etl_helpers.src.utils.modules import ModulesGDrive
 
@@ -37,7 +37,9 @@ def process_gdrive_data(
             file_ids=file_ids, folder_ids=folder_ids, drive_ids=drive_ids
         )
 
-        ingest_data = GoogleDriveIngestionPipeline(community_id=community_id)
+        ingest_data = CustomIngestionPipeline(
+            community_id=community_id, collection_name="gdrive"
+        )
         ingest_data.run_pipeline(load_file_data)
 
     communities_info = get_gdrive_communities()
diff --git a/dags/hivemind_notion_etl.py b/dags/hivemind_notion_etl.py
index edbe2d6d..3bf36b95 100644
--- a/dags/hivemind_notion_etl.py
+++ b/dags/hivemind_notion_etl.py
@@ -5,7 +5,7 @@
 
 from airflow import DAG
 from airflow.decorators import task
-from dags.hivemind_etl_helpers.notion_etl import process_notion_etl
+from hivemind_etl_helpers.notion_etl import process_notion_etl
 from hivemind_etl_helpers.src.utils.modules import ModulesNotion
 
 with DAG(
diff --git a/docker-compose.test.yml b/docker-compose.test.yml
index 6d66fc65..5b79f55b 100644
--- a/docker-compose.test.yml
+++ b/docker-compose.test.yml
@@ -32,6 +32,9 @@ services:
       - REDIS_HOST=redis
       - REDIS_PORT=6379
      - REDIS_PASSWORD=password
+      - QDRANT_HOST=qdrant
+      - QDRANT_PORT=6333
+      - QDRANT_API_KEY=
     volumes:
       - ./coverage:/project/coverage
     depends_on:
@@ -43,6 +46,8 @@ services:
         condition: service_healthy
       redis:
         condition: service_healthy
+      qdrant-healthcheck:
+        condition: service_healthy
   neo4j:
     image: "neo4j:5.9.0"
     environment:
@@ -90,3 +95,27 @@
       retries: 50
       start_period: 30s
     restart: always
+  qdrant:
+    image: qdrant/qdrant:v1.9.2
+    restart: always
+    container_name: qdrant
+    ports:
+      - 6333:6333
+      - 6334:6334
+    expose:
+      - 6333
+      - 6334
+      - 6335
+    volumes:
+      - ./qdrant_data:/qdrant_data
+  qdrant-healthcheck:
+    restart: always
+    image: curlimages/curl:latest
+    entrypoint: ["/bin/sh", "-c", "--", "while true; do sleep 30; done;"]
+    depends_on:
+      - qdrant
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://qdrant:6333/readyz"]
+      interval: 10s
+      timeout: 2s
+      retries: 5
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 61fb8470..15c2b068 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -65,7 +65,10 @@ x-airflow-common: &airflow-common
       condition: service_healthy
     neo4j:
       condition: service_healthy
-
+    qdrant-healthcheck:
+      condition: service_healthy
+    mongo:
+      condition: service_healthy
 services:
   postgres:
     image: ankane/pgvector
@@ -80,12 +83,34 @@ services:
     # volumes:
     #   - postgres-db-volume:/var/lib/postgresql/data
     healthcheck:
-      test: ["CMD", "pg_isready", "-U", "airflow"]
+      test: pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB
      interval: 10s
       retries: 5
       start_period: 5s
     restart: always
+  qdrant:
+    image: qdrant/qdrant:v1.9.2
+    restart: always
+    container_name: qdrant
+    ports:
+      - 6333:6333
+    expose:
+      - 6333
+    volumes:
+      - ./qdrant_data:/qdrant_data
+  qdrant-healthcheck:
+    restart: always
+    image: curlimages/curl:latest
+    entrypoint: ["/bin/sh", "-c", "--", "while true; do sleep 30; done;"]
+    depends_on:
+      - qdrant
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://qdrant:6333/readyz"]
+      interval: 10s
+      timeout: 2s
+      retries: 5
+
 
   redis:
     image: redis:latest
     expose:
@@ -109,10 +134,11 @@
       - MONGO_INITDB_ROOT_PASSWORD=pass
     healthcheck:
       test: echo 'db.stats().ok' | mongosh localhost:27017/test --quiet
-      interval: 60s
+      interval: 30s
       timeout: 10s
-      retries: 2
+      retries: 3
       start_period: 40s
+    # restart: always
 
   neo4j:
     image: "neo4j:5.9.0"
@@ -122,7 +148,7 @@
       - NEO4J_dbms_security_procedures_unrestricted=apoc.*,gds.*
     healthcheck:
       test: ["CMD", "wget", "http://localhost:7474"]
-      interval: 1m30s
+      interval: 1m
       timeout: 10s
       retries: 2
       start_period: 40s
diff --git a/init.sh b/init.sh
index 2b9294f1..1b686701 100755
--- a/init.sh
+++ b/init.sh
@@ -5,18 +5,6 @@ function ver() {
   IFS='.' read -r major minor patch <<< "$version"
   printf "%04d%04d%04d" "${major:-0}" "${minor:-0}" "${patch:-0}"
 }
-# shellcheck disable=SC2034
-airflow_version=$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version)
-airflow_version_comparable=$(ver "${airflow_version}")
-min_airflow_version=2.2.0
-min_airflow_version_comparable=$(ver "${min_airflow_version}")
-if (( airflow_version_comparable < min_airflow_version_comparable )); then
-  echo
-  echo -e "\033[1;31mERROR!!!: Too old Airflow version ${airflow_version}!\e[0m"
-  echo "The minimum Airflow version supported: ${min_airflow_version}. Only use this or higher!"
-  echo
-  exit 1
-fi
 if [[ -z "${AIRFLOW_UID}" ]]; then
   echo
   echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
diff --git a/requirements.txt b/requirements.txt
index a7a7cd8b..42fe1d9e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 numpy==1.26.0
 pymongo==4.6.2
+motor==3.4.0
 pgvector==0.2.5
 asyncpg==0.29.0
 psycopg2-binary==2.9.9
@@ -15,12 +16,11 @@ coverage>=7.3.3, <8.0.0
 pytest>=7.4.3, <8.0.0
 python-dotenv>=1.0.0, <2.0.0
 urlextract==1.8.0
-tc-hivemind-backend==1.1.6
+tc-hivemind-backend==1.2.0
 traceloop-sdk>=0.15.2, <0.16.0
 beautifulsoup4==4.12.3
 llama-index-readers-notion==0.1.6
 llama-index-readers-google==0.2.5
-llama-index-storage-docstore-postgres==0.1.3
 llama-index-storage-docstore-redis==0.1.2
-llama-index-vector-stores-postgres==0.1.7
+llama-index-storage-docstore-mongodb==0.1.3
 redis===5.0.4
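# --- Illustrative note (not part of the patch) ---
# A minimal sketch of the storage naming convention the new pipeline implies,
# derived from the f-strings in ingestion_pipeline.py above. The helper name
# and return shape are hypothetical, for illustration only.
def storage_names(community_id: str, platform: str) -> dict[str, str]:
    return {
        # Qdrant collection holding the embedded nodes
        "qdrant_collection": f"{community_id}_{platform}",
        # MongoDB database and namespace used by MongoDocumentStore.from_uri
        "mongo_docstore_db": f"docstore_{community_id}",
        "mongo_namespace": platform,
        # Redis key space used by the IngestionCache
        "redis_cache_collection": f"{community_id}_{platform}_ingestion_cache",
    }


print(storage_names("1234", "gdrive"))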