From 4a66e98638951c14a7d93fd9ec95abf9dbbbe64b Mon Sep 17 00:00:00 2001
From: Daniel Roy Greenfeld <62857+pydanny@users.noreply.github.com>
Date: Mon, 30 Oct 2023 17:05:36 +0000
Subject: [PATCH] Break out LLMs into their own dependency and tests

This will speed up tests and also allow for experimental efforts like using
transformers.
---
 .github/workflows/ci-llms.yml            | 44 ++++++++++++++++++++++++
 .github/workflows/ci.yml                 |  7 ++--
 CONTRIBUTING.md                          | 14 ++++++++
 Makefile                                 |  5 +++
 pyproject.toml                           | 10 ++++--
 src/interviewkit/questions.py            | 36 +++++++++++++------
 src/interviewkit/transcript_using_m5.py  | 28 +++++++--------
 tests/{ => code}/test_cli_version.py     |  0
 tests/{ => llms}/test_install_package.py |  0
 9 files changed, 113 insertions(+), 31 deletions(-)
 create mode 100644 .github/workflows/ci-llms.yml
 rename tests/{ => code}/test_cli_version.py (100%)
 rename tests/{ => llms}/test_install_package.py (100%)

diff --git a/.github/workflows/ci-llms.yml b/.github/workflows/ci-llms.yml
new file mode 100644
index 0000000..7690b9b
--- /dev/null
+++ b/.github/workflows/ci-llms.yml
@@ -0,0 +1,44 @@
+name: Project Tests with LLMs
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python 3.11
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.11'
+
+    - name: Free up disk space
+      run: |
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /opt/ghc
+        sudo rm -rf "/usr/local/share/boost"
+        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e '.[test]'
+        pip install -e '.[clarifai_grpc]'
+        pip install -e '.[transformers]'
+
+    - name: Install ffmpeg
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y ffmpeg
+
+    - name: Run LLM tests
+      run: |
+        make test_llms
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ba989a8..7da3cfa 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -37,9 +37,6 @@ jobs:
         sudo apt-get update
         sudo apt-get install -y ffmpeg
 
-    - name: Find and run pytest tests
+    - name: Run non-LLM tests
      run: |
-        test_files=$(find ./tests/ -name "*test*.py")
-
-        # Run pytest on all found test files
-        python -m pytest $test_files
+        make test
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index fbec2a0..8b28bb2 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -133,6 +133,20 @@ ruff check . --fix
 ruff format .
 ```
 
+#### How to Run Tests
+
+To run the non-LLM tests (fast), run this at the project root:
+
+```sh
+pytest tests/code/*.py
+```
+
+To run the LLM tests, run this at the project root:
+
+```sh
+pytest tests/llms/*.py
+```
+
 #### How Do I Submit a Good Enhancement Suggestion?
 
 Enhancement suggestions are tracked as [GitHub issues](https://github.com/historysciencelab/HistoryAIToolkit/issues).
diff --git a/Makefile b/Makefile
index bedc779..e642343 100644
--- a/Makefile
+++ b/Makefile
@@ -4,3 +4,8 @@ format:
 
 lint:
 	ruff check . --fix
+test:
+	pytest tests/code/*.py
+
+test_llms:
+	pytest tests/llms/*.py
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 82b4c5d..bca652e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,13 +18,11 @@ classifiers = [
 ]
 license = {text = "GNU General Public License v3"}
 dependencies = [
-    "clarifai_grpc==9.9.1",
     "openai-whisper==20230918",
     "pydantic==2.4.2",
     "pydantic-settings==2.0.3",
     "pydub==0.25.1",
     "rich==13.6.0",
-    "transformers==4.34.0",
     "typer==0.9.0",
     "yarl==1.9.2"
 ]
@@ -49,6 +47,14 @@ docs = [
     "mkdocs-include-markdown-plugin"
 ]
 
+clarifai_grpc = [
+    "clarifai_grpc==9.9.1"
+]
+
+transformers = [
+    "transformers==4.34.0"
+]
+
 [project.urls]
 
 bugs = "https://github.com/audreyfeldroy/historyaitoolkit/issues"
diff --git a/src/interviewkit/questions.py b/src/interviewkit/questions.py
index f7546c8..030bf6a 100644
--- a/src/interviewkit/questions.py
+++ b/src/interviewkit/questions.py
@@ -1,8 +1,13 @@
-from clarifai_grpc.channel.clarifai_channel import ClarifaiChannel
-from clarifai_grpc.grpc.api import resources_pb2, service_pb2, service_pb2_grpc
-from clarifai_grpc.grpc.api.status import status_code_pb2
+from rich.console import Console
 
+console = Console()
+
+try:
+    import clarifai_grpc
+except ImportError:
+    clarifai_grpc = None
+
 
 # # Securely get your credentials
 # TODO: Pass in arguments or use env vars
 CLARIFAI_PAT = ""
@@ -16,29 +21,40 @@
 
 def generate_questions_from_transcript(transcript: str):
-    channel = ClarifaiChannel.get_grpc_channel()
-    stub = service_pb2_grpc.V2Stub(channel)
+    if clarifai_grpc is None:
+        console.print(
+            "Please install clarifai-grpc: pip install 'historyaitoolkit\[clarifai_grpc]'",
+            style="bold red",
+        )
+        exit(1)
+    channel = clarifai_grpc.channel.clarifai_channel.ClarifaiChannel.get_grpc_channel()
+    stub = clarifai_grpc.grpc.api.service_pb2_grpc.V2Stub(channel)
 
     metadata = (("authorization", "Key " + CLARIFAI_PAT),)
 
-    userDataObject = resources_pb2.UserAppIDSet(
+    userDataObject = clarifai_grpc.grpc.api.resources_pb2.UserAppIDSet(
         user_id=CLARIFAI_USER_ID, app_id=CLARIFAI_APP_ID
     )
 
     post_model_outputs_response = stub.PostModelOutputs(
-        service_pb2.PostModelOutputsRequest(
+        clarifai_grpc.grpc.api.service_pb2.PostModelOutputsRequest(
             user_app_id=userDataObject,
             model_id=CLARIFAI_MODEL_ID,
             version_id=CLARIFAI_MODEL_VERSION_ID,
             inputs=[
-                resources_pb2.Input(
-                    data=resources_pb2.Data(text=resources_pb2.Text(raw=transcript))
+                clarifai_grpc.grpc.api.resources_pb2.Input(
+                    data=clarifai_grpc.grpc.api.resources_pb2.Data(
+                        text=clarifai_grpc.grpc.api.resources_pb2.Text(raw=transcript)
+                    )
                 )
             ],
         ),
         metadata=metadata,
     )
 
-    if post_model_outputs_response.status.code != status_code_pb2.SUCCESS:
+    if (
+        post_model_outputs_response.status.code
+        != clarifai_grpc.grpc.api.status.status_code_pb2.SUCCESS
+    ):
         print(post_model_outputs_response.status)
         status = post_model_outputs_response.status.description
         raise Exception(f"Post model outputs failed, status: {status}")
diff --git a/src/interviewkit/transcript_using_m5.py b/src/interviewkit/transcript_using_m5.py
index 032577a..ab91b33 100644
--- a/src/interviewkit/transcript_using_m5.py
+++ b/src/interviewkit/transcript_using_m5.py
@@ -1,22 +1,27 @@
 import sys
 from pathlib import Path
 
+import whisper
+from pydantic import BaseModel
 from rich.console import Console
-from transformers import T5ForConditionalGeneration, T5Tokenizer
+from whisper.utils import get_writer
+
+console = Console()
 
 try:
-    import whisper
+    import transformers
+
+    # Load T5 model and tokenizer
+    tokenizer = transformers.T5Tokenizer.from_pretrained("t5-base")
+    model = transformers.T5ForConditionalGeneration.from_pretrained("t5-base")
 except ImportError:
-    print("Please install Whisper: pip install openai-whisper")
+    console.print(
+        "Please install transformers: pip install 'historyaitoolkit\[transformers]'",
+        style="bold red",
+    )
     exit(1)
 
-from pydantic import BaseModel
-from whisper.utils import get_writer
-
-
-console = Console()
-
 
 class Transcript(BaseModel):
     """The Transcript entity represents the transcript of an interview."""
 
@@ -24,11 +29,6 @@ class Transcript(BaseModel):
     content: str
 
 
-# Load T5 model and tokenizer
-tokenizer = T5Tokenizer.from_pretrained("t5-base")
-model = T5ForConditionalGeneration.from_pretrained("t5-base")
-
-
 def chunk_text(text, max_length):
     """Split the text into chunks of max_length."""
     words = text.split()
diff --git a/tests/test_cli_version.py b/tests/code/test_cli_version.py
similarity index 100%
rename from tests/test_cli_version.py
rename to tests/code/test_cli_version.py
diff --git a/tests/test_install_package.py b/tests/llms/test_install_package.py
similarity index 100%
rename from tests/test_install_package.py
rename to tests/llms/test_install_package.py