Skip to content

Commit

Permalink
feat: initializing the bot!
Browse files Browse the repository at this point in the history
  • Loading branch information
amindadgar committed Dec 28, 2023
1 parent 652a0db commit a467be0
Show file tree
Hide file tree
Showing 28 changed files with 847 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.github/

.coverage/
.coverage
coverage

venv/
.env
12 changes: 12 additions & 0 deletions .github/workflows/production.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
name: Production CI/CD Pipeline

on:
push:
branches:
- main

jobs:
ci:
uses: TogetherCrew/operations/.github/workflows/ci.yml@main
secrets:
CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
9 changes: 9 additions & 0 deletions .github/workflows/start.staging.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
name: Staging CI/CD Pipeline

on: pull_request

jobs:
ci:
uses: TogetherCrew/operations/.github/workflows/ci.yml@main
secrets:
CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

hivemind-bot-env/*
12 changes: 12 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# It's recommended that we use `bullseye` for Python (alpine isn't suitable as it conflicts with numpy)
FROM python:3.11-bullseye AS base
WORKDIR /project
COPY . .
RUN pip3 install -r requirements.txt

# Test stage: runs the test suite through the entrypoint script.
FROM base AS test
RUN chmod +x docker-entrypoint.sh
CMD ["./docker-entrypoint.sh"]

# Production stage: starts a Celery worker for the `celery_app.server` app.
# Celery has to be launched as a module (`-m`); `python3 celery` would make
# Python look for a script file named `celery` in the working directory.
FROM base AS prod
CMD ["python3", "-m", "celery", "-A", "celery_app.server", "worker", "-l", "INFO"]
Empty file added celery_app/__init__.py
Empty file.
30 changes: 30 additions & 0 deletions celery_app/job_send.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from tc_messageBroker import RabbitMQ
from tc_messageBroker.rabbit_mq.event import Event
from tc_messageBroker.rabbit_mq.queue import Queue


def job_send(broker_url, port, username, password, res):
    """
    Publish a result message on the discord analyzer queue.

    Parameters
    ------------
    broker_url : str
        the host of the RabbitMQ broker
    port : int
        the port of the RabbitMQ broker
    username : str
        the username to connect to the broker with
    password : str
        the password to connect to the broker with
    res : Any
        the result to send; it is formatted into the message's `data` field
    """
    broker = RabbitMQ(
        broker_url=broker_url, port=port, username=username, password=password
    )

    # the uuid is a fixed placeholder value for now
    payload = {
        "uuid": "d99a1490-fba6-11ed-b9a9-0d29e7612dp8",
        "data": f"some results {res}",
    }

    broker.connect(Queue.DISCORD_ANALYZER)
    broker.publish(
        queue_name=Queue.DISCORD_ANALYZER,
        event=Event.DISCORD_BOT.FETCH,
        content=payload,
    )


if __name__ == "__main__":
    # TODO: read from .env
    # local development connection settings for a manual run of `job_send`
    broker_url, port = "localhost", 5672
    username, password = "root", "pass"
    job_send(broker_url, port, username, password, "CALLED FROM __main__")
5 changes: 5 additions & 0 deletions celery_app/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from celery import Celery

# TODO: read from .env
# The Celery application object; the broker URL (host and credentials
# included) is hard-coded here for now.
app = Celery("celery_app/tasks", broker="pyamqp://root:pass@localhost//")
# Ask Celery to discover the task definitions of the `celery_app` package.
app.autodiscover_tasks(["celery_app"])
27 changes: 27 additions & 0 deletions celery_app/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from celery_app.server import app
from celery_app.job_send import job_send

# TODO: Write tasks that match our requirements


@app.task
def add(x, y):
    """
    Celery task: add `x` and `y`, publish the result and return it.

    The sum is sent through `job_send` before being returned.
    The broker connection settings are hard-coded for now.
    """
    total = x + y
    job_send("localhost", 5672, "root", "pass", total)
    return total


@app.task
def mul(x, y):
    """Celery task: return the product of `x` and `y`."""
    product = x * y
    return product


@app.task
def xsum(numbers):
    """Celery task: return the sum of the items in `numbers`."""
    total = sum(numbers)
    return total
147 changes: 147 additions & 0 deletions discord_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
from retrievers.forum_summary_retriever import (
ForumBasedSummaryRetriever,
)
from retrievers.process_dates import process_dates
from retrievers.utils.load_hyperparams import load_hyperparams
from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
from tc_hivemind_backend.pg_vector_access import PGVectorAccess
from llama_index import QueryBundle
from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters


def query_discord(
    community_id: str,
    query: str,
    thread_names: list[str],
    channel_names: list[str],
    days: list[str],
    similarity_top_k: int | None = None,
) -> str:
    """
    query the discord data of a community using the given filters
    and return the query engine's answer to the given query

    Parameters
    ------------
    community_id : str
        the community to query;
        its data is read from the `community_{community_id}` database
    query : str
        the query (question) of the user
    thread_names : list[str]
        the thread names to filter on
    channel_names : list[str]
        the channel names to filter on
    days : list[str]
        the dates to filter on
    similarity_top_k : int | None
        the count of similar results to use when querying the data
        if `None`, the value is taken from `load_hyperparams`

    Returns
    ---------
    response : str
        the response given to the query
    """
    if similarity_top_k is None:
        _, similarity_top_k, _ = load_hyperparams()

    vector_access = PGVectorAccess(
        table_name="discord", dbname=f"community_{community_id}"
    )
    index = vector_access.load_index()

    # single quotes within channel and thread names are doubled
    # before the names are used as filter values
    channel_filters = [
        ExactMatchFilter(key="channel", value=name.replace("'", "''"))
        for name in channel_names
    ]
    thread_filters = [
        ExactMatchFilter(key="thread", value=name.replace("'", "''"))
        for name in thread_names
    ]
    day_filters = [ExactMatchFilter(key="date", value=day) for day in days]

    # the filters are combined with OR: threads first, then channels, then days
    metadata_filters = MetadataFilters(
        filters=[*thread_filters, *channel_filters, *day_filters],
        condition=FilterCondition.OR,
    )

    engine = index.as_query_engine(
        filters=metadata_filters, similarity_top_k=similarity_top_k
    )

    # the query embedding is computed explicitly with the Cohere embedding model
    query_embedding = CohereEmbedding().get_text_embedding(text=query)
    answer = engine.query(QueryBundle(query_str=query, embedding=query_embedding))

    return answer.response


def query_discord_auto_filter(
    community_id: str,
    query: str,
    similarity_top_k: int | None = None,
    d: int | None = None,
) -> str:
    """
    answer the query with the metadata filters chosen automatically.
    The summaries are queried first to get the channels, threads and dates,
    which are then used as the filters of `query_discord`

    Parameters
    -----------
    community_id : str
        the community to query;
        its data is read from the `community_{community_id}` database
    query : str
        the query (question) of the user
    similarity_top_k : int | None
        the count of similar nodes to use for the initial summary search
        if `None`, the value is taken from `load_hyperparams`
    d : int | None
        the value handed to `process_dates` together with the retrieved dates,
        so the secondary search (`query_discord`) is done
        on `metadata.date - d` to `metadata.date + d`
        if `None`, the value is taken from `load_hyperparams`

    Returns
    ---------
    response : str
        the response given to the query
    """
    summary_table = "discord_summary"
    community_db = f"community_{community_id}"

    if d is None:
        _, _, d = load_hyperparams()
    if similarity_top_k is None:
        similarity_top_k, _, _ = load_hyperparams()

    summary_retriever = ForumBasedSummaryRetriever(
        table_name=summary_table, dbname=community_db
    )

    # first stage: get the metadata of the summaries matching the query
    channels, threads, dates = summary_retriever.retreive_metadata(
        query=query,
        metadata_group1_key="channel",
        metadata_group2_key="thread",
        metadata_date_key="date",
        similarity_top_k=similarity_top_k,
    )

    expanded_dates = process_dates(dates, d)

    # second stage: query the discord data itself using the retrieved filters
    return query_discord(
        community_id=community_id,
        query=query,
        thread_names=threads,
        channel_names=channels,
        days=expanded_dates,
    )
14 changes: 14 additions & 0 deletions docker-compose.example.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
version: "3.9"

services:
server:
build:
context: .
target: prod
dockerfile: Dockerfile
worker:
build:
context: .
target: prod
dockerfile: Dockerfile
command: python3 worker.py
71 changes: 71 additions & 0 deletions docker-compose.test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
version: "3.9"

services:
app:
build:
context: .
target: test
dockerfile: Dockerfile
environment:
- PORT=3000
- MONGODB_HOST=mongo
- MONGODB_PORT=27017
- MONGODB_USER=root
- MONGODB_PASS=pass
- NEO4J_PROTOCOL=bolt
- NEO4J_HOST=neo4j
- NEO4J_PORT=7687
- NEO4J_USER=neo4j
- NEO4J_PASSWORD=password
- NEO4J_DB=neo4j
- POSTGRES_HOST=postgres
- POSTGRES_USER=root
- POSTGRES_PASS=pass
- POSTGRES_PORT=5432
- CHUNK_SIZE=512
- EMBEDDING_DIM=1024
- K1_RETRIEVER_SEARCH=20
- K2_RETRIEVER_SEARCH=5
- D_RETRIEVER_SEARCH=7
volumes:
- ./coverage:/project/coverage
depends_on:
neo4j:
condition: service_healthy
mongo:
condition: service_healthy
postgres:
condition: service_healthy
neo4j:
image: "neo4j:5.9.0"
environment:
- NEO4J_AUTH=neo4j/password
- NEO4J_PLUGINS=["apoc", "graph-data-science"]
- NEO4J_dbms_security_procedures_unrestricted=apoc.*,gds.*
healthcheck:
test: ["CMD" ,"wget", "http://localhost:7474"]
interval: 1m30s
timeout: 10s
retries: 2
start_period: 40s
mongo:
image: "mongo:6.0.8"
environment:
- MONGO_INITDB_ROOT_USERNAME=root
- MONGO_INITDB_ROOT_PASSWORD=pass
healthcheck:
test: echo 'db.stats().ok' | mongosh localhost:27017/test --quiet
interval: 60s
timeout: 10s
retries: 2
start_period: 40s
postgres:
image: "ankane/pgvector"
environment:
- POSTGRES_USER=root
- POSTGRES_PASSWORD=pass
healthcheck:
test: ["CMD-SHELL", "pg_isready"]
interval: 10s
timeout: 5s
retries: 5
3 changes: 3 additions & 0 deletions docker-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Run the test suite under coverage, then export the results as an lcov report.
# The pytest exit status is captured so a failing test run is still reported
# to the caller after the report has been written; otherwise the script's
# status would be that of the final `coverage lcov` command.
python3 -m coverage run --omit="tests/*" -m pytest .
test_status=$?

python3 -m coverage lcov -o coverage/lcov.info
report_status=$?

# A test failure takes precedence over the report generation status.
if [ "$test_status" -ne 0 ]; then
    exit "$test_status"
fi
exit "$report_status"
20 changes: 20 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
numpy
llama-index>=0.9.21, <1.0.0
pymongo
python-dotenv
pgvector
asyncpg
psycopg2-binary
sqlalchemy[asyncio]
async-sqlalchemy
python-pptx
tc-neo4j-lib
google-api-python-client
unstructured
cohere
neo4j>=5.14.1, <6.0.0
coverage>=7.3.3, <8.0.0
pytest>=7.4.3, <8.0.0
python-dotenv==1.0.0
tc_hivemind_backend==1.0.0
celery>=5.3.6, <6.0.0
Empty file added retrievers/__init__.py
Empty file.
Loading

0 comments on commit a467be0

Please sign in to comment.