feat: initializing the bot!

TogetherCrew · Dec 28, 2023 · a467be0 · a467be0
1 parent 652a0db
commit a467be0
Show file tree

Hide file tree

Showing 28 changed files with 847 additions and 0 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,8 @@
+.github/
+
+.coverage/
+.coverage
+coverage
+
+venv/
+.env
diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml
@@ -0,0 +1,12 @@
+name: Production CI/CD Pipeline
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  ci:
+    uses: TogetherCrew/operations/.github/workflows/ci.yml@main
+    secrets:
+      CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
diff --git a/.github/workflows/start.staging.yml b/.github/workflows/start.staging.yml
@@ -0,0 +1,9 @@
+name: Staging CI/CD Pipeline
+
+on: pull_request
+
+jobs:
+  ci:
+    uses: TogetherCrew/operations/.github/workflows/ci.yml@main
+    secrets:
+      CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
diff --git a/.gitignore b/.gitignore
@@ -158,3 +158,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+hivemind-bot-env/*
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,12 @@
+# It's recommended that we use `bullseye` for Python (alpine isn't suitable as it conflicts with numpy)
+FROM python:3.11-bullseye AS base
+WORKDIR /project
+COPY . .
+RUN pip3 install -r requirements.txt
+
+FROM base AS test
+RUN chmod +x docker-entrypoint.sh
+CMD ["./docker-entrypoint.sh"]
+
+FROM base AS prod
+CMD ["python3", "-m", "celery", "-A", "celery_app.server", "worker", "-l", "INFO"]
diff --git a/celery_app/__init__.py b/celery_app/__init__.py
diff --git a/celery_app/job_send.py b/celery_app/job_send.py
@@ -0,0 +1,30 @@
+from tc_messageBroker import RabbitMQ
+from tc_messageBroker.rabbit_mq.event import Event
+from tc_messageBroker.rabbit_mq.queue import Queue
+
+
+def job_send(broker_url, port, username, password, res):
+    """Publish a message with a fixed uuid and `res` embedded in its data to Queue.DISCORD_ANALYZER."""
+    broker = RabbitMQ(
+        broker_url=broker_url, port=port, username=username, password=password
+    )
+    payload = {
+        "uuid": "d99a1490-fba6-11ed-b9a9-0d29e7612dp8",
+        "data": f"some results {res}",
+    }
+
+    # Connect to the queue first, then publish one Event.DISCORD_BOT.FETCH event on it.
+    target_queue = Queue.DISCORD_ANALYZER
+    broker.connect(target_queue)
+    broker.publish(
+        queue_name=target_queue, event=Event.DISCORD_BOT.FETCH, content=payload
+    )
+
+
+if __name__ == "__main__":
+    # Manual run against a broker on localhost.
+    # TODO: read from .env
+    connection = {"broker_url": "localhost", "port": 5672}
+    credentials = {"username": "root", "password": "pass"}
+
+    job_send(**connection, **credentials, res="CALLED FROM __main__")
diff --git a/celery_app/server.py b/celery_app/server.py
@@ -0,0 +1,5 @@
+from celery import Celery
+
+# Celery application instance; task modules are discovered in the `celery_app` package.
+# TODO: read from .env
+app = Celery(main="celery_app/tasks", broker="pyamqp://root:pass@localhost//")
+app.autodiscover_tasks(packages=["celery_app"])
diff --git a/celery_app/tasks.py b/celery_app/tasks.py
@@ -0,0 +1,27 @@
+from celery_app.server import app
+from celery_app.job_send import job_send
+
+# TODO: Write tasks that match our requirements
+
+
+@app.task
+def add(x, y):
+    """Celery task: return `x + y`, first sending the sum through `job_send`."""
+    total = x + y
+
+    # Positional order: broker_url, port, username, password (hard-coded local
+    # settings), then the computed sum.
+    job_send("localhost", 5672, "root", "pass", total)
+
+    return total
+
+
+@app.task
+def mul(x, y):
+    """Celery task: return the product `x * y`."""
+    product = x * y
+    return product
+
+
+@app.task
+def xsum(numbers):
+    """Celery task: return `sum(numbers)`."""
+    total = sum(numbers)
+    return total
diff --git a/discord_query.py b/discord_query.py
@@ -0,0 +1,147 @@
+from retrievers.forum_summary_retriever import (
+    ForumBasedSummaryRetriever,
+)
+from retrievers.process_dates import process_dates
+from retrievers.utils.load_hyperparams import load_hyperparams
+from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
+from tc_hivemind_backend.pg_vector_access import PGVectorAccess
+from llama_index import QueryBundle
+from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters
+
+
+def query_discord(
+    community_id: str,
+    query: str,
+    thread_names: list[str],
+    channel_names: list[str],
+    days: list[str],
+    similarity_top_k: int | None = None,
+) -> str:
+    """
+    query the discord database using filters given
+    and give an answer to the given query using the LLM
+
+    Parameters
+    ------------
+    community_id : str
+        the community whose discord data is queried
+        (the database name used is `community_{community_id}`)
+    query : str
+        the query (question) of the user
+    thread_names : list[str]
+        the given threads to search for
+    channel_names : list[str]
+        the given channels to search for
+    days : list[str]
+        the given days to search for
+    similarity_top_k : int | None
+        the k similar results to use when querying the data
+        if `None`, the second value returned by `load_hyperparams` is used
+
+    Returns
+    ---------
+    response : str
+        the LLM response given the query
+    """
+    if similarity_top_k is None:
+        _, similarity_top_k, _ = load_hyperparams()
+
+    pg_vector = PGVectorAccess(
+        table_name="discord", dbname=f"community_{community_id}"
+    )
+    index = pg_vector.load_index()
+
+    # single quotes in thread and channel names are doubled before filtering;
+    # dates are passed through unchanged
+    thread_filters = [
+        ExactMatchFilter(key="thread", value=thread.replace("'", "''"))
+        for thread in thread_names
+    ]
+    channel_filters = [
+        ExactMatchFilter(key="channel", value=channel.replace("'", "''"))
+        for channel in channel_names
+    ]
+    day_filters = [ExactMatchFilter(key="date", value=day) for day in days]
+
+    # the filters are combined with OR: matching any single
+    # thread, channel or date filter is enough
+    filters = MetadataFilters(
+        filters=[*thread_filters, *channel_filters, *day_filters],
+        condition=FilterCondition.OR,
+    )
+
+    query_engine = index.as_query_engine(
+        filters=filters, similarity_top_k=similarity_top_k
+    )
+
+    # the query embedding is computed explicitly and shipped with the query text
+    query_bundle = QueryBundle(
+        query_str=query, embedding=CohereEmbedding().get_text_embedding(text=query)
+    )
+    response = query_engine.query(query_bundle)
+
+    return response.response
+
+
+def query_discord_auto_filter(
+    community_id: str,
+    query: str,
+    similarity_top_k: int | None = None,
+    d: int | None = None,
+) -> str:
+    """
+    get the query results and do the filtering automatically.
+    By automatically we mean, it would first query the summaries
+    to get the metadata filters
+
+    Parameters
+    -----------
+    community_id : str
+        the community whose discord data is queried
+        (the database name used is `community_{community_id}`)
+    query : str
+        the query (question) of the user
+    similarity_top_k : int | None
+        the number of similar nodes to fetch in the initial summary search
+        if `None`, the first value returned by `load_hyperparams` is used
+    d : int | None
+        this would make the secondary search (`query_discord`)
+        to be done on the `metadata.date - d` to `metadata.date + d`
+        if `None`, the third value returned by `load_hyperparams` is used
+
+    Returns
+    ---------
+    response : str
+        the LLM response given the query
+    """
+    # load the hyperparameters once, and only when a default is actually needed
+    if similarity_top_k is None or d is None:
+        default_top_k, _, default_d = load_hyperparams()
+        if similarity_top_k is None:
+            similarity_top_k = default_top_k
+        if d is None:
+            d = default_d
+
+    discord_retriever = ForumBasedSummaryRetriever(
+        table_name="discord_summary", dbname=f"community_{community_id}"
+    )
+
+    channels, threads, dates = discord_retriever.retreive_metadata(
+        query=query,
+        metadata_group1_key="channel",
+        metadata_group2_key="thread",
+        metadata_date_key="date",
+        similarity_top_k=similarity_top_k,
+    )
+
+    dates_modified = process_dates(dates, d)
+
+    # `similarity_top_k` is not forwarded here, so `query_discord`
+    # falls back to its own default for the secondary search
+    return query_discord(
+        community_id=community_id,
+        query=query,
+        thread_names=threads,
+        channel_names=channels,
+        days=dates_modified,
+    )
diff --git a/docker-compose.example.yml b/docker-compose.example.yml
@@ -0,0 +1,14 @@
+version: "3.9"
+
+services:
+  server:
+    build:
+      context: .
+      target: prod
+      dockerfile: Dockerfile
+  worker:
+    build:
+      context: .
+      target: prod
+      dockerfile: Dockerfile
+    command: python3 worker.py
diff --git a/docker-compose.test.yml b/docker-compose.test.yml
@@ -0,0 +1,71 @@
+version: "3.9"
+
+services:
+  app:
+    build:
+      context: .
+      target: test
+      dockerfile: Dockerfile
+    environment:
+      - PORT=3000
+      - MONGODB_HOST=mongo
+      - MONGODB_PORT=27017
+      - MONGODB_USER=root
+      - MONGODB_PASS=pass
+      - NEO4J_PROTOCOL=bolt
+      - NEO4J_HOST=neo4j
+      - NEO4J_PORT=7687
+      - NEO4J_USER=neo4j
+      - NEO4J_PASSWORD=password
+      - NEO4J_DB=neo4j
+      - POSTGRES_HOST=postgres
+      - POSTGRES_USER=root
+      - POSTGRES_PASS=pass
+      - POSTGRES_PORT=5432
+      - CHUNK_SIZE=512
+      - EMBEDDING_DIM=1024
+      - K1_RETRIEVER_SEARCH=20
+      - K2_RETRIEVER_SEARCH=5
+      - D_RETRIEVER_SEARCH=7
+    volumes:
+      - ./coverage:/project/coverage
+    depends_on:
+      neo4j:
+        condition: service_healthy
+      mongo:
+        condition: service_healthy
+      postgres:
+        condition: service_healthy
+  neo4j:
+    image: "neo4j:5.9.0"
+    environment:
+      - NEO4J_AUTH=neo4j/password
+      - NEO4J_PLUGINS=["apoc", "graph-data-science"]
+      - NEO4J_dbms_security_procedures_unrestricted=apoc.*,gds.*
+    healthcheck:
+      test: ["CMD" ,"wget", "http://localhost:7474"]
+      interval: 1m30s
+      timeout: 10s
+      retries: 2
+      start_period: 40s
+  mongo:
+    image: "mongo:6.0.8"
+    environment:
+      - MONGO_INITDB_ROOT_USERNAME=root
+      - MONGO_INITDB_ROOT_PASSWORD=pass
+    healthcheck:
+      test: echo 'db.stats().ok' | mongosh localhost:27017/test --quiet
+      interval: 60s
+      timeout: 10s
+      retries: 2
+      start_period: 40s
+  postgres:
+    image: "ankane/pgvector"
+    environment:
+      - POSTGRES_USER=root
+      - POSTGRES_PASSWORD=pass
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# Run the test suite under coverage, always write the lcov report, and exit
+# with pytest's status so a failing suite is not masked by the report step.
+python3 -m coverage run --omit=tests/* -m pytest .
+test_status=$?
+python3 -m coverage lcov -o coverage/lcov.info || exit $?
+exit "$test_status"
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,20 @@
+numpy
+llama-index>=0.9.21, <1.0.0
+pymongo
+python-dotenv
+pgvector
+asyncpg
+psycopg2-binary
+sqlalchemy[asyncio]
+async-sqlalchemy
+python-pptx
+tc-neo4j-lib
+google-api-python-client
+unstructured
+cohere
+neo4j>=5.14.1, <6.0.0
+coverage>=7.3.3, <8.0.0
+pytest>=7.4.3, <8.0.0
+python-dotenv==1.0.0
+tc_hivemind_backend==1.0.0
+celery>=5.3.6, <6.0.0
diff --git a/retrievers/__init__.py b/retrievers/__init__.py