From 4a66e98638951c14a7d93fd9ec95abf9dbbbe64b Mon Sep 17 00:00:00 2001
From: Daniel Roy Greenfeld <62857+pydanny@users.noreply.github.com>
Date: Mon, 30 Oct 2023 17:05:36 +0000
Subject: [PATCH] Break out LLMs into their own dependency and tests

This will speed up tests and also allow for experimental efforts like using
transformers.
---
 .github/workflows/ci-llms.yml            | 44 ++++++++++++++++++++++++
 .github/workflows/ci.yml                 |  7 ++--
 CONTRIBUTING.md                          | 14 ++++++++
 Makefile                                 |  5 +++
 pyproject.toml                           | 10 ++++--
 src/interviewkit/questions.py            | 36 +++++++++++++------
 src/interviewkit/transcript_using_m5.py  | 28 +++++++--------
 tests/{ => code}/test_cli_version.py     |  0
 tests/{ => llms}/test_install_package.py |  0
 9 files changed, 113 insertions(+), 31 deletions(-)
 create mode 100644 .github/workflows/ci-llms.yml
 rename tests/{ => code}/test_cli_version.py (100%)
 rename tests/{ => llms}/test_install_package.py (100%)

diff --git a/.github/workflows/ci-llms.yml b/.github/workflows/ci-llms.yml
new file mode 100644
index 0000000..7690b9b
--- /dev/null
+++ b/.github/workflows/ci-llms.yml
@@ -0,0 +1,44 @@
+name: Project Tests with LLMs
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python 3.11
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.11'
+
+    - name: Free up disk space
+      run: |
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /opt/ghc
+        sudo rm -rf "/usr/local/share/boost"
+        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e '.[test]'
+        pip install -e '.[clarifai_grpc]'
+        pip install -e '.[transformers]'
+
+    - name: Install ffmpeg
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y ffmpeg
+
+    - name: Run LLM tests
+      run: |
+        make test_llms
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ba989a8..7da3cfa 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -37,9 +37,6 @@ jobs:
         sudo apt-get update
         sudo apt-get install -y ffmpeg
 
-    - name: Find and run pytest tests
+    - name: Run non-LLM tests
      run: |
-        test_files=$(find ./tests/ -name "*test*.py")
-
-        # Run pytest on all found test files
-        python -m pytest $test_files
+        make test
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index fbec2a0..8b28bb2 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -133,6 +133,20 @@ ruff check . --fix
 ruff format .
 ```
 
+#### How to Run Tests
+
+To run the non-LLM tests (fast), run this at the project root:
+
+```sh
+pytest tests/code/*.py
+```
+
+To run the LLM tests, run this at the project root:
+
+```sh
+pytest tests/llms/*.py
+```
+
 #### How Do I Submit a Good Enhancement Suggestion?
 
 Enhancement suggestions are tracked as [GitHub issues](https://github.com/historysciencelab/HistoryAIToolkit/issues).
diff --git a/Makefile b/Makefile
index bedc779..e642343 100644
--- a/Makefile
+++ b/Makefile
@@ -4,3 +4,8 @@ format:
 
 lint:
 	ruff check . --fix
+test:
+	pytest tests/code/*.py
+
+test_llms:
+	pytest tests/llms/*.py
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 82b4c5d..bca652e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,13 +18,11 @@ classifiers = [
 ]
 license = {text = "GNU General Public License v3"}
 dependencies = [
-    "clarifai_grpc==9.9.1",
     "openai-whisper==20230918",
     "pydantic==2.4.2",
     "pydantic-settings==2.0.3",
     "pydub==0.25.1",
     "rich==13.6.0",
-    "transformers==4.34.0",
     "typer==0.9.0",
     "yarl==1.9.2"
 ]
@@ -49,6 +47,14 @@ docs = [
     "mkdocs-include-markdown-plugin"
 ]
 
+clarifai_grpc = [
+    "clarifai_grpc==9.9.1"
+]
+
+transformers = [
+    "transformers==4.34.0"
+]
+
 [project.urls]
 
 bugs = "https://github.com/audreyfeldroy/historyaitoolkit/issues"
diff --git a/src/interviewkit/questions.py b/src/interviewkit/questions.py
index f7546c8..030bf6a 100644
--- a/src/interviewkit/questions.py
+++ b/src/interviewkit/questions.py
@@ -1,8 +1,13 @@
-from clarifai_grpc.channel.clarifai_channel import ClarifaiChannel
-from clarifai_grpc.grpc.api import resources_pb2, service_pb2, service_pb2_grpc
-from clarifai_grpc.grpc.api.status import status_code_pb2
+from rich.console import Console
 
+console = Console()
+
+try:
+    import clarifai_grpc
+except ImportError:
+    clarifai_grpc = None
+
 
 # # Securely get your credentials
 # TODO: Pass in arguments or use env vars
 CLARIFAI_PAT = ""
@@ -16,29 +21,40 @@
 
 def generate_questions_from_transcript(transcript: str):
-    channel = ClarifaiChannel.get_grpc_channel()
-    stub = service_pb2_grpc.V2Stub(channel)
+    if clarifai_grpc is None:
+        console.print(
+            "Please install clarifai-grpc: pip install 'historyaitoolkit\[clarifai_grpc]'",
+            style="bold red",
+        )
+        exit(1)
+    channel = clarifai_grpc.channel.clarifai_channel.ClarifaiChannel.get_grpc_channel()
+    stub = clarifai_grpc.grpc.api.service_pb2_grpc.V2Stub(channel)
 
     metadata = (("authorization", "Key " + CLARIFAI_PAT),)
 
-    userDataObject = resources_pb2.UserAppIDSet(
+    userDataObject = clarifai_grpc.grpc.api.resources_pb2.UserAppIDSet(
         user_id=CLARIFAI_USER_ID, app_id=CLARIFAI_APP_ID
     )
 
     post_model_outputs_response = stub.PostModelOutputs(
-        service_pb2.PostModelOutputsRequest(
+        clarifai_grpc.grpc.api.service_pb2.PostModelOutputsRequest(
             user_app_id=userDataObject,
             model_id=CLARIFAI_MODEL_ID,
             version_id=CLARIFAI_MODEL_VERSION_ID,
             inputs=[
-                resources_pb2.Input(
-                    data=resources_pb2.Data(text=resources_pb2.Text(raw=transcript))
+                clarifai_grpc.grpc.api.resources_pb2.Input(
+                    data=clarifai_grpc.grpc.api.resources_pb2.Data(
+                        text=clarifai_grpc.grpc.api.resources_pb2.Text(raw=transcript)
+                    )
                 )
             ],
         ),
         metadata=metadata,
     )
 
-    if post_model_outputs_response.status.code != status_code_pb2.SUCCESS:
+    if (
+        post_model_outputs_response.status.code
+        != clarifai_grpc.grpc.api.status.status_code_pb2.SUCCESS
+    ):
         print(post_model_outputs_response.status)
         status = post_model_outputs_response.status.description
         raise Exception(f"Post model outputs failed, status: {status}")
diff --git a/src/interviewkit/transcript_using_m5.py b/src/interviewkit/transcript_using_m5.py
index 032577a..ab91b33 100644
--- a/src/interviewkit/transcript_using_m5.py
+++ b/src/interviewkit/transcript_using_m5.py
@@ -1,22 +1,27 @@
 import sys
 from pathlib import Path
 
+import whisper
+from pydantic import BaseModel
 from rich.console import Console
-from transformers import T5ForConditionalGeneration, T5Tokenizer
+from whisper.utils import get_writer
+
+console = Console()
 
 try:
-    import whisper
+    import transformers
+
+    # Load T5 model and tokenizer
+    tokenizer = transformers.T5Tokenizer.from_pretrained("t5-base")
+    model = transformers.T5ForConditionalGeneration.from_pretrained("t5-base")
 except ImportError:
-    print("Please install Whisper: pip install openai-whisper")
+    console.print(
+        "Please install transformers: pip install 'historyaitoolkit\[transformers]'",
+        style="bold red",
+    )
     exit(1)
 
-from pydantic import BaseModel
-from whisper.utils import get_writer
-
-
-console = Console()
-
 
 class Transcript(BaseModel):
     """The Transcript entity represents the transcript of an interview."""
 
@@ -24,11 +29,6 @@ class Transcript(BaseModel):
     content: str
 
 
-# Load T5 model and tokenizer
-tokenizer = T5Tokenizer.from_pretrained("t5-base")
-model = T5ForConditionalGeneration.from_pretrained("t5-base")
-
-
 def chunk_text(text, max_length):
     """Split the text into chunks of max_length."""
     words = text.split()
diff --git a/tests/test_cli_version.py b/tests/code/test_cli_version.py
similarity index 100%
rename from tests/test_cli_version.py
rename to tests/code/test_cli_version.py
diff --git a/tests/test_install_package.py b/tests/llms/test_install_package.py
similarity index 100%
rename from tests/test_install_package.py
rename to tests/llms/test_install_package.py