From 52099b68fd9588692afd04a745cc33e518c17780 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Fri, 16 Aug 2024 13:04:38 -0400 Subject: [PATCH 001/112] Restructure as python package (#19) * add init to panza to turn it into a package * add pyproject.toml but no dependencies yet * add the rest of the panzamail dependencies :) * install dependencies based on pyproject.toml instead of raw pip and conda commands --- prepare_env.sh | 9 ++------- pyproject.toml | 30 ++++++++++++++++++++++++++++++ src/panza/__init__.py | 0 3 files changed, 32 insertions(+), 7 deletions(-) create mode 100644 pyproject.toml create mode 100644 src/panza/__init__.py diff --git a/prepare_env.sh b/prepare_env.sh index b6e6972..c730020 100644 --- a/prepare_env.sh +++ b/prepare_env.sh @@ -4,10 +4,5 @@ trap 'trap - ERR RETURN; kill -INT $$ ; return' ERR RETURN conda create --name panza python=3.10 -y conda activate panza -conda install pytorch==2.2.2 torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y - -pip install langdetect langchain langchain-community sentence-transformers faiss-cpu fire mauve-text evaluate torchmetrics gradio cmake packaging nltk - -pip install git+https://github.com/IST-DASLab/llm-foundry -pip install git+https://github.com/IST-DASLab/peft-rosa.git@grad_quant -pip install spops_sm_80 +# install dependencies based on pyproject.toml +pip install -e . \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d45c5ec --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,30 @@ +[project] +name = "panza_mail" +version = "2024.08.14" +description = "A personal email assistant, trained and running on-device." +dependencies = [ + "torch==2.2.2", + "langdetect", + "langchain", + "langchain-community", + "sentence-transformers", + "faiss-cpu", + "fire", + "mauve-text", + "evaluate", + "torchmetrics", + "gradio", + "cmake", + "packaging", + "nltk", + "llm-foundry@git+https://github.com/IST-DASLab/llm-foundry", + "peft@git+https://github.com/IST-DASLab/peft-rosa.git@grad_quant_looser_versioning", + "spops-sm-80" +] + +[build-system] +requires = ["setuptools >= 61.0.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["src"] \ No newline at end of file diff --git a/src/panza/__init__.py b/src/panza/__init__.py new file mode 100644 index 0000000..e69de29 From 053dc09a419331bb69af743debb1568e3886f7a7 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Fri, 16 Aug 2024 21:25:24 +0300 Subject: [PATCH 002/112] Add Ollama inference --- pyproject.toml | 1 + scripts/run_panza_ollama.sh | 31 +++++++++ src/panza/evaluation/ollama_inference.py | 83 ++++++++++++++++++++++++ 3 files changed, 115 insertions(+) create mode 100755 scripts/run_panza_ollama.sh create mode 100644 src/panza/evaluation/ollama_inference.py diff --git a/pyproject.toml b/pyproject.toml index d45c5ec..515b6b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "cmake", "packaging", "nltk", + "ollama", "llm-foundry@git+https://github.com/IST-DASLab/llm-foundry", "peft@git+https://github.com/IST-DASLab/peft-rosa.git@grad_quant_looser_versioning", "spops-sm-80" diff --git a/scripts/run_panza_ollama.sh b/scripts/run_panza_ollama.sh new file mode 100755 index 0000000..d63dce8 --- /dev/null +++ b/scripts/run_panza_ollama.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +source config.sh + +MODEL=${PANZA_GENERATIVE_MODEL} # Replace this with the checkpoint you want to use! 
+ +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" +done + +USE_RAG=$([ "${PANZA_DISABLE_RAG_INFERENCE}" = "1" ] && echo "" || echo "--use-rag") +USE_4BIT_QUANT=$([ "${MODEL_PRECISION}" = "4bit" ] && echo "--load-in-4bit" || echo "") + +INFERENCE_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/ollama_inference.py +python ${INFERENCE_SCRIPT} \ + --model=llama3.1 \ + --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ + --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ + --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ + --embedding-model=${PANZA_EMBEDDING_MODEL} \ + --db-path=${PANZA_DATA_DIR} \ + --index-name=${PANZA_USERNAME} \ + --rag-relevance-threshold=${PANZA_RAG_RELEVANCE_THRESHOLD} \ + ${USE_RAG} \ + ${USE_4BIT_QUANT} diff --git a/src/panza/evaluation/ollama_inference.py b/src/panza/evaluation/ollama_inference.py new file mode 100644 index 0000000..a0d7a84 --- /dev/null +++ b/src/panza/evaluation/ollama_inference.py @@ -0,0 +1,83 @@ +import os +import sys + +import ollama +import torch + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + +from panza.evaluation import base_inference +from panza.utils import prompting, rag +from panza.utils.documents import Email + +sys.path.pop(0) + + +def get_response_stream(prompt: str, model: str): + stream = ollama.chat( + model=model, + messages=[{"role": "user", "content": prompt}], + stream=True, + ) + + return stream + + +def print_response_stream(stream): + for chunk in stream: + print(chunk["message"]["content"], end="", flush=True) + + + +def main(): + parser = base_inference.get_base_inference_args_parser() + args = parser.parse_args() + + print("Running inference with args:", args) + + if args.nthreads is not None: + torch.set_num_threads(args.nthreads) + + + if args.use_rag: + embeddings_model = rag.get_embeddings_model(args.embedding_model) + db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) + + system_preamble, user_preamble, rag_preamble, _ = prompting.load_all_preambles( + args.system_preamble, args.user_preamble, args.rag_preamble, args.thread_preamble + ) + + while True: + instruction = input("Enter another request (or 'quit' to exit): ") + + if instruction.lower() == "quit": + print("Exiting...") + break + + relevant_emails = [] + if args.use_rag: + assert db is not None, "RAG requires a database to be provided." 
+ re = db._similarity_search_with_relevance_scores(instruction, k=args.rag_num_emails) + relevant_emails = [ + Email.deserialize(r[0].metadata["serialized_email"]) + for r in re + if r[1] >= args.rag_relevance_threshold + ] + + prompt = prompting.create_prompt( + instruction, + system_preamble, + user_preamble, + rag_preamble, + relevant_emails, + ) + + print("Running with prompt:", prompt) + + args.model = "llama3.1" + stream = get_response_stream(prompt, args.model) + print_response_stream(stream) + + +if __name__ == "__main__": + main() From 0e8eb89c9202104052af58a50fed9430e892d4eb Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Tue, 20 Aug 2024 13:34:00 -0400 Subject: [PATCH 003/112] Panza Web Server (#12) * Add Ollama inference * expose Panza as a web server * add api keys to env variables and check in server * check api key * switch to fastapi to prevent model reloading * Add ollama-backed streaming HTTP server --------- Co-authored-by: Armand Nicolicioiu --- .env | 2 + scripts/run_ollama_services.sh | 36 +++++++ scripts/run_services.sh | 36 +++++++ .../evaluation/ollama_service_inference.py | 77 +++++++++++++ src/panza/evaluation/service_inference.py | 101 ++++++++++++++++++ 5 files changed, 252 insertions(+) create mode 100644 .env create mode 100755 scripts/run_ollama_services.sh create mode 100755 scripts/run_services.sh create mode 100644 src/panza/evaluation/ollama_service_inference.py create mode 100644 src/panza/evaluation/service_inference.py diff --git a/.env b/.env new file mode 100644 index 0000000..0605d28 --- /dev/null +++ b/.env @@ -0,0 +1,2 @@ +# Store API keys here +API_KEYS=apikey1,apikey2,apikey3 \ No newline at end of file diff --git a/scripts/run_ollama_services.sh b/scripts/run_ollama_services.sh new file mode 100755 index 0000000..61857be --- /dev/null +++ b/scripts/run_ollama_services.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +source config.sh + +MODEL="custom" + +DEVICE="cuda:1" +DTYPE="bf16" + +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" +done + +USE_RAG=$([ "${PANZA_DISABLE_RAG_INFERENCE}" = "1" ] && echo "" || echo "--use-rag") +USE_4BIT_QUANT=$([ "${MODEL_PRECISION}" = "4bit" ] && echo "--load-in-4bit" || echo "") + +INFERENCE_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/ollama_service_inference.py +python ${INFERENCE_SCRIPT} \ + --model=${MODEL} \ + --device=${DEVICE} \ + --dtype=${DTYPE} \ + --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ + --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ + --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ + --embedding-model=${PANZA_EMBEDDING_MODEL} \ + --db-path=${PANZA_DATA_DIR} \ + --index-name=${PANZA_USERNAME} \ + --rag-relevance-threshold=${PANZA_RAG_RELEVANCE_THRESHOLD} \ + ${USE_RAG} \ + ${USE_4BIT_QUANT} \ No newline at end of file diff --git a/scripts/run_services.sh b/scripts/run_services.sh new file mode 100755 index 0000000..0f7122a --- /dev/null +++ b/scripts/run_services.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +source config.sh + +MODEL="../checkpoints/models/panza_seanyang711_llama3_bf16-bs8-rosa_wl16_d0.01_1grads_mean_squared_r8_loralr1e-5_alpha16-lr1e-5-epochs5-wu8-seed42-PREAMBLE-16296" + +DEVICE="cuda:1" +DTYPE="bf16" + +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" +done + +USE_RAG=$([ "${PANZA_DISABLE_RAG_INFERENCE}" = "1" ] && echo "" || echo "--use-rag") 
+USE_4BIT_QUANT=$([ "${MODEL_PRECISION}" = "4bit" ] && echo "--load-in-4bit" || echo "") + +INFERENCE_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/service_inference.py +python ${INFERENCE_SCRIPT} \ + --model=${MODEL} \ + --device=${DEVICE} \ + --dtype=${DTYPE} \ + --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ + --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ + --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ + --embedding-model=${PANZA_EMBEDDING_MODEL} \ + --db-path=${PANZA_DATA_DIR} \ + --index-name=${PANZA_USERNAME} \ + --rag-relevance-threshold=${PANZA_RAG_RELEVANCE_THRESHOLD} \ + ${USE_RAG} \ + ${USE_4BIT_QUANT} \ No newline at end of file diff --git a/src/panza/evaluation/ollama_service_inference.py b/src/panza/evaluation/ollama_service_inference.py new file mode 100644 index 0000000..2831536 --- /dev/null +++ b/src/panza/evaluation/ollama_service_inference.py @@ -0,0 +1,77 @@ +import os +import sys +from typing import Annotated + +from fastapi import FastAPI, HTTPException, Header +from fastapi.responses import StreamingResponse +import uvicorn +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel +from dotenv import load_dotenv + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + +from panza.evaluation import base_inference +from panza.utils import prompting +from panza.evaluation import ollama_inference + +class Request(BaseModel): + text: str + +sys.path.pop(0) + +app = FastAPI() + +origins = [ + "https://mail.google.com", +] + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Load environment variables from the .env file +load_dotenv() +valid_api_keys = os.getenv("API_KEYS").split(",") + +parser = base_inference.get_base_inference_args_parser() +args = parser.parse_args() + +print("Running inference with args:", args) + +system_preamble, user_preamble, rag_preamble, _ = prompting.load_all_preambles( + args.system_preamble, args.user_preamble, args.rag_preamble, args.thread_preamble +) + +def predict(user_input): + relevant_emails = [] + prompt = prompting.create_prompt( + user_input, + system_preamble, + user_preamble, + rag_preamble, + relevant_emails, + ) + return ollama_inference.get_response_stream(prompt, args.model) + +def streamer(stream): + for chunk in stream: + yield chunk["message"]["content"] + +@app.options('/generate') +def options(): + return {"methods": ["POST"]} + +@app.post('/generate') +def generate_text(request: Request, x_api_key: Annotated[str | None, Header()] = None): + if x_api_key not in valid_api_keys: + raise HTTPException(status_code=401, detail="Invalid API key.") + stream = predict(request.text) + return StreamingResponse(streamer(stream), media_type='text/event-stream') + +if __name__ == '__main__': + uvicorn.run(app, host='0.0.0.0', port=5001) \ No newline at end of file diff --git a/src/panza/evaluation/service_inference.py b/src/panza/evaluation/service_inference.py new file mode 100644 index 0000000..80ae230 --- /dev/null +++ b/src/panza/evaluation/service_inference.py @@ -0,0 +1,101 @@ +import os +import sys +from typing import Annotated + +import torch + +from fastapi import FastAPI, HTTPException, Header +import uvicorn +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel +from dotenv import load_dotenv + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + +from panza.evaluation import base_inference +from panza.utils import 
prompting, rag + +class Request(BaseModel): + text: str + +class Response(BaseModel): + generated_text: str + +sys.path.pop(0) + +app = FastAPI() + +origins = [ + "https://mail.google.com", +] + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Load environment variables from the .env file +load_dotenv() +valid_api_keys = os.getenv("API_KEYS").split(",") + +parser = base_inference.get_base_inference_args_parser() +args = parser.parse_args() + +print("Running inference with args:", args) + +if args.nthreads is not None: + torch.set_num_threads(args.nthreads) + +print("Loading model ", args.model) +model, tokenizer = base_inference.load_model_and_tokenizer(args.model, args.device, args.dtype, load_in_4bit=args.load_in_4bit) + +if args.use_rag: + embeddings_model = rag.get_embeddings_model(args.embedding_model) + db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) + +system_preamble, user_preamble, rag_preamble = prompting.load_all_preambles( + args.system_preamble, args.user_preamble, args.rag_preamble +) + +def predict(user_input): + prompts, outputs = base_inference.run_inference( + instructions=[user_input], + model=model, + tokenizer=tokenizer, + system_preamble=system_preamble, + user_preamble=user_preamble, + rag_preamble=rag_preamble, + rag_relevance_threshold=args.rag_relevance_threshold, + rag_num_emails=args.rag_num_emails, + use_rag=args.use_rag, + db=db if args.use_rag else None, + max_new_tokens=args.max_new_tokens, + best=args.best, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + device=args.device, + ) + + print("Processed input:", prompts[0]) + print("Generated email", outputs[0]) + + return outputs[0] + +@app.options('/generate') +def options(): + return {"methods": ["POST"]} + +@app.post('/generate') +def generate_text(request: Request, x_api_key: Annotated[str | None, Header()] = None): + if x_api_key not in valid_api_keys: + raise HTTPException(status_code=401, detail="Invalid API key, must be one of: " + str(valid_api_keys)) + generated_text = predict(request.text) + return {"generated_text": generated_text} + + +if __name__ == '__main__': + uvicorn.run(app, host='0.0.0.0', port=5000) \ No newline at end of file From 498fdc13220ef126231d0f95d0275d9e6c3f250f Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Tue, 20 Aug 2024 13:06:51 +0200 Subject: [PATCH 004/112] Add black line length configuration --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 515b6b8..09da1d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,4 +28,7 @@ requires = ["setuptools >= 61.0.0"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] -where = ["src"] \ No newline at end of file +where = ["src"] + +[tool.black] +line-length = 100 \ No newline at end of file From aec8d50a6bb8f7e770aebbd9d95d048e5b598bee Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Tue, 20 Aug 2024 19:39:10 +0200 Subject: [PATCH 005/112] Add new Panza src path --- src/panza3/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/panza3/__init__.py diff --git a/src/panza3/__init__.py b/src/panza3/__init__.py new file mode 100644 index 0000000..e69de29 From 39d37e2765ac9a4452ee742478959320563f7c25 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Thu, 22 Aug 2024 10:31:59 +0200 Subject: [PATCH 006/112] Create class interfaces --- 
src/panza3/entities/__init__.py | 4 ++++ .../entities/document.py} | 0 src/panza3/entities/instruction.py | 16 ++++++++++++++++ src/panza3/llm/__init__.py | 3 +++ src/panza3/llm/base.py | 18 ++++++++++++++++++ src/panza3/prompting/__init__.py | 3 +++ src/panza3/prompting/base.py | 13 +++++++++++++ src/panza3/retriever/__init__.py | 3 +++ src/panza3/retriever/base.py | 16 ++++++++++++++++ src/panza3/writer/__init__.py | 0 src/panza3/writer/base.py | 15 +++++++++++++++ 11 files changed, 91 insertions(+) create mode 100644 src/panza3/entities/__init__.py rename src/{panza/utils/documents.py => panza3/entities/document.py} (100%) create mode 100644 src/panza3/entities/instruction.py create mode 100644 src/panza3/llm/__init__.py create mode 100644 src/panza3/llm/base.py create mode 100644 src/panza3/prompting/__init__.py create mode 100644 src/panza3/prompting/base.py create mode 100644 src/panza3/retriever/__init__.py create mode 100644 src/panza3/retriever/base.py create mode 100644 src/panza3/writer/__init__.py create mode 100644 src/panza3/writer/base.py diff --git a/src/panza3/entities/__init__.py b/src/panza3/entities/__init__.py new file mode 100644 index 0000000..f306513 --- /dev/null +++ b/src/panza3/entities/__init__.py @@ -0,0 +1,4 @@ +from .document import Document, Email +from .instruction import EmailInstruction, Instruction + +__all__ = ["Document", "Email", "EmailInstruction", "Instruction"] diff --git a/src/panza/utils/documents.py b/src/panza3/entities/document.py similarity index 100% rename from src/panza/utils/documents.py rename to src/panza3/entities/document.py diff --git a/src/panza3/entities/instruction.py b/src/panza3/entities/instruction.py new file mode 100644 index 0000000..449b4e3 --- /dev/null +++ b/src/panza3/entities/instruction.py @@ -0,0 +1,16 @@ +from abc import ABC +from dataclasses import dataclass +from typing import List + +from ..llm import ChatHistoryType + + +@dataclass +class Instruction(ABC): + instruction: str + past_messages: ChatHistoryType + + +@dataclass(kw_only=True) +class EmailInstruction(Instruction): + thread: List[str] diff --git a/src/panza3/llm/__init__.py b/src/panza3/llm/__init__.py new file mode 100644 index 0000000..cc64f95 --- /dev/null +++ b/src/panza3/llm/__init__.py @@ -0,0 +1,3 @@ +from .base import LLM, ChatHistoryType, MessageType + +__all__ = ["LLM", "ChatHistoryType", "MessageType"] diff --git a/src/panza3/llm/base.py b/src/panza3/llm/base.py new file mode 100644 index 0000000..3bcad4f --- /dev/null +++ b/src/panza3/llm/base.py @@ -0,0 +1,18 @@ +from abc import ABC, abstractmethod +from typing import Dict, Iterator, List, Literal + +MessageType = Dict[Literal["role", "content"], str] +ChatHistoryType = List[MessageType] + + +class LLM(ABC): + def __init__(self, name: str): + self.name = name + + @abstractmethod + def chat(self, messages: ChatHistoryType) -> str: + pass + + @abstractmethod + def chat_stream(self, messages: ChatHistoryType) -> Iterator[str]: + pass diff --git a/src/panza3/prompting/__init__.py b/src/panza3/prompting/__init__.py new file mode 100644 index 0000000..3b89e60 --- /dev/null +++ b/src/panza3/prompting/__init__.py @@ -0,0 +1,3 @@ +from .base import PromptBuilder + +__all__ = ["PromptBuilder"] diff --git a/src/panza3/prompting/base.py b/src/panza3/prompting/base.py new file mode 100644 index 0000000..713b5f1 --- /dev/null +++ b/src/panza3/prompting/base.py @@ -0,0 +1,13 @@ +from abc import ABC, abstractmethod + +from ..entities import Instruction +from ..retriever import DocumentRetriever + + +class 
PromptBuilder(ABC): + def __init__(self, retriever: DocumentRetriever): + self.retriever = retriever + + @abstractmethod + def build_prompt(self, instruction: Instruction) -> str: + pass diff --git a/src/panza3/retriever/__init__.py b/src/panza3/retriever/__init__.py new file mode 100644 index 0000000..c01b978 --- /dev/null +++ b/src/panza3/retriever/__init__.py @@ -0,0 +1,3 @@ +from .base import DocumentRetriever + +__all__ = ["DocumentRetriever"] diff --git a/src/panza3/retriever/base.py b/src/panza3/retriever/base.py new file mode 100644 index 0000000..6e12af0 --- /dev/null +++ b/src/panza3/retriever/base.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod +from typing import List, Optional, Tuple + +from ..entities.document import Document + + +class DocumentRetriever(ABC): + @abstractmethod + def retrieve(self, query: str, k: int, score: Optional[float] = None) -> List[Document]: + pass + + @abstractmethod + def retrieve_with_score( + self, query: str, k: int, score: Optional[float] = None + ) -> List[Tuple[Document, float]]: + pass diff --git a/src/panza3/writer/__init__.py b/src/panza3/writer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/panza3/writer/base.py b/src/panza3/writer/base.py new file mode 100644 index 0000000..5f767df --- /dev/null +++ b/src/panza3/writer/base.py @@ -0,0 +1,15 @@ +from abc import ABC, abstractmethod + +from ..entities import Instruction +from ..llm import LLM +from ..prompting import PromptBuilder + + +class PanzaWriter(ABC): + def __init__(self, prompt_builder: PromptBuilder, llm: LLM): + self.prompt_builder = prompt_builder + self.llm = llm + + @abstractmethod + def run(self, instruction: Instruction) -> str: + pass From 769277ca5d5027f31a38750bdf2956ee3b6582c0 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Fri, 23 Aug 2024 10:59:22 +0200 Subject: [PATCH 007/112] Set up unit testing --- pyproject.toml | 5 ++++- tests/test_entities.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 tests/test_entities.py diff --git a/pyproject.toml b/pyproject.toml index 09da1d4..5f50131 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,4 +31,7 @@ build-backend = "setuptools.build_meta" where = ["src"] [tool.black] -line-length = 100 \ No newline at end of file +line-length = 100 + +[tool.pytest.ini_options] +pythonpath = ["src"] \ No newline at end of file diff --git a/tests/test_entities.py b/tests/test_entities.py new file mode 100644 index 0000000..b486053 --- /dev/null +++ b/tests/test_entities.py @@ -0,0 +1,13 @@ +import json +from datetime import datetime + +import pytest + +from panza3.entities import Email + + +def test_email_serialization_deserialization(): + email = Email(email="email", subject="subject", thread=["thread"], date=datetime.now()) + serialized = json.dumps(email.serialize()) + deserialized = Email.deserialize(serialized) + assert email == deserialized From 42acc78dc35d460fd3859f60f16cc643eb2fade5 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Fri, 23 Aug 2024 14:35:13 +0200 Subject: [PATCH 008/112] web hosting --- src/panza3/hosting/__init__.py | 3 ++ src/panza3/hosting/web_service.py | 62 +++++++++++++++++++++++++++++++ src/panza3/run_panza.py | 4 ++ 3 files changed, 69 insertions(+) create mode 100644 src/panza3/hosting/__init__.py create mode 100644 src/panza3/hosting/web_service.py create mode 100644 src/panza3/run_panza.py diff --git a/src/panza3/hosting/__init__.py b/src/panza3/hosting/__init__.py new file 
mode 100644 index 0000000..b5367f4 --- /dev/null +++ b/src/panza3/hosting/__init__.py @@ -0,0 +1,3 @@ +from .web_service import PanzaWebService + +__all__ = ["PanzaWebService"] \ No newline at end of file diff --git a/src/panza3/hosting/web_service.py b/src/panza3/hosting/web_service.py new file mode 100644 index 0000000..e41c48a --- /dev/null +++ b/src/panza3/hosting/web_service.py @@ -0,0 +1,62 @@ +import os +from typing import Annotated, Generator, List +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from fastapi import FastAPI, HTTPException, Header +from fastapi.responses import StreamingResponse +import uvicorn +from pydantic import BaseModel +from dotenv import load_dotenv +import threading + +class Request(BaseModel): + text: str + +class PanzaWebService: + DEFAULT_PORT = 5001 + + def __init__(self, port=DEFAULT_PORT): + self.app = FastAPI() + self.port = port + self._setup_routes() + load_dotenv() + self._add_cors() + self.api_keys = self._get_valid_api_keys() + self._start_server() + + def _add_cors(self): + self.app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + def _get_valid_api_keys(self) -> List[str]: + return os.getenv("API_KEYS").split(",") + + def _streamer(self, stream): + for chunk in stream: + yield chunk["message"]["content"] + + def _predict(self, input: str) -> Generator: + # TODO: Call PanzaWriter here + # Dummy generator + for i in range(10): + yield {"message": {"content": f"Generated text {i}"}} + + def _setup_routes(self): + @self.app.options('/generate') + def options(): + return {"methods": ["POST"]} + + @self.app.post('/generate') + def generate_text(request: Request, x_api_key: Annotated[str | None, Header()]): + if x_api_key not in self.api_keys: + raise HTTPException(status_code=401, detail="Invalid API key.") + stream = self._predict(request.text) + return StreamingResponse(self._streamer(stream), media_type='text/event-stream') + + def _start_server(self): + uvicorn.run(self.app, port=self.port) \ No newline at end of file diff --git a/src/panza3/run_panza.py b/src/panza3/run_panza.py new file mode 100644 index 0000000..918d4d6 --- /dev/null +++ b/src/panza3/run_panza.py @@ -0,0 +1,4 @@ +from hosting import PanzaWebService + +# create a new PanzaWebService +service = PanzaWebService() \ No newline at end of file From 081bf946d77a2972397a4e67859db4cb82477f5e Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Fri, 23 Aug 2024 15:32:23 +0200 Subject: [PATCH 009/112] implement ollama llm --- src/panza3/llm/__init__.py | 3 +- src/panza3/llm/ollama_llm.py | 68 ++++++++++++++++++++++++++++++++++++ src/panza3/run_panza.py | 16 +++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 src/panza3/llm/ollama_llm.py diff --git a/src/panza3/llm/__init__.py b/src/panza3/llm/__init__.py index cc64f95..c278cd2 100644 --- a/src/panza3/llm/__init__.py +++ b/src/panza3/llm/__init__.py @@ -1,3 +1,4 @@ from .base import LLM, ChatHistoryType, MessageType +from .ollama_llm import OllamaLLM -__all__ = ["LLM", "ChatHistoryType", "MessageType"] +__all__ = ["LLM", "ChatHistoryType", "MessageType", "OllamaLLM"] diff --git a/src/panza3/llm/ollama_llm.py b/src/panza3/llm/ollama_llm.py new file mode 100644 index 0000000..9676b2a --- /dev/null +++ b/src/panza3/llm/ollama_llm.py @@ -0,0 +1,68 @@ +import os +import ollama + +from .base import LLM + +class OllamaLLM(LLM): + def __init__(self, name: 
str, gguf_file: str, sampling_params: dict): + """ + Loads and serves the model from the GGUF file into Ollama with the given name and sampling parameters. + """ + super().__init__(name) + self.gguf_file = gguf_file + self.sampling_params = sampling_params + + if not self._is_ollama_running(): + self._start_ollama() + + if not self._is_model_loaded(): + self._load_model() + + def _is_ollama_running(self): + try: + ollama.list() + return True + except: + return False + + def _start_ollama(self): + # run the bash command "ollama list" which causes Ollama to start if it is not already running + try: + os.system("/bin/bash -c 'ollama list'") + except: + raise Exception("Ollama failed to start.") + + def _is_model_loaded(self): + for model in ollama.list()['models']: + # model name is everything before the colon + name = model['name'].split(":")[0] + if name == self.name: + return True + return False + + def _load_model(self): + # TODO: Add sampling parameters to the model file + modelfile = f""" + FROM {self.gguf_file} + """ + try: + ollama.create(model=self.name, modelfile=modelfile, stream=True) + except: + raise Exception(f"Failed to load model {self.name} with GGUF file {self.gguf_file}.") + + def _get_message(self, response): + return response['message']['content'] + + def chat(self, messages): + response = ollama.chat(model=self.name, messages=messages, stream=False) + return self._get_message(response) + + def chat_stream(self, messages): + stream = ollama.chat( + model=self.name, + messages=messages, + stream=True, + ) + # return a new stream that only contains the message content + for chunk in stream: + yield self._get_message(chunk) \ No newline at end of file diff --git a/src/panza3/run_panza.py b/src/panza3/run_panza.py index 918d4d6..4c21a62 100644 --- a/src/panza3/run_panza.py +++ b/src/panza3/run_panza.py @@ -1,4 +1,20 @@ from hosting import PanzaWebService +from llm import OllamaLLM, ChatHistoryType + +llm = OllamaLLM("custom", "path/to/file", {}) + +messages: ChatHistoryType = [{"role": "user", "content": "Write a one-sentence email saying i will be late to the meeting"}] + +# Example of how to use the LLM with streaming +stream = llm.chat_stream(messages) +while True: + try: + print(next(stream)) + except StopIteration: + break + +# Example of how to use the LLM without streaming +print(llm.chat(messages)) # create a new PanzaWebService service = PanzaWebService() \ No newline at end of file From de2fe1291ccb044fbbcc3f92edd0651a771b4af7 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Sat, 24 Aug 2024 14:24:10 +0200 Subject: [PATCH 010/112] Add FAISS retriever and update Document interface to prepare for indexing --- .../data_preparation/create_vector_store.py | 18 +--- src/panza/utils/rag.py | 34 ------- src/panza3/entities/document.py | 27 ++++++ src/panza3/retriever/__init__.py | 3 +- src/panza3/retriever/base.py | 4 + src/panza3/retriever/faiss.py | 96 +++++++++++++++++++ tests/conftest.py | 46 +++++++++ tests/test_entities.py | 9 ++ tests/test_retriever.py | 66 +++++++++++++ 9 files changed, 251 insertions(+), 52 deletions(-) create mode 100644 src/panza3/retriever/faiss.py create mode 100644 tests/conftest.py create mode 100644 tests/test_retriever.py diff --git a/src/panza/data_preparation/create_vector_store.py b/src/panza/data_preparation/create_vector_store.py index 86c4c7f..20b11c2 100644 --- a/src/panza/data_preparation/create_vector_store.py +++ b/src/panza/data_preparation/create_vector_store.py @@ -3,8 +3,7 @@ import time from typing import List
-from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_core.documents import Document + from panza.utils import rag from panza.utils.documents import Email @@ -20,21 +19,6 @@ def load_emails(path: str) -> List[Email]: return emails -def process_emails(emails: List[Email], chunk_size: int, chunk_overlap: int) -> List[Document]: - # Convert e-mails to langchain documents - documents = [ - Document(page_content=email.email, metadata={"serialized_email": email.serialize()}) - for email in emails - ] - - # Split long e-mails into text chuncks - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=chunk_size, chunk_overlap=chunk_overlap - ) - documents = text_splitter.split_documents(documents) - - return documents - def main(): parser = argparse.ArgumentParser() diff --git a/src/panza/utils/rag.py b/src/panza/utils/rag.py index 5653ed9..b33e9ae 100644 --- a/src/panza/utils/rag.py +++ b/src/panza/utils/rag.py @@ -1,37 +1,3 @@ from typing import List -from langchain_community.embeddings import HuggingFaceEmbeddings -from langchain_community.vectorstores import FAISS from langchain_core.documents import Document -from langchain_core.embeddings import Embeddings -from langchain_core.vectorstores import VectorStore - - -def get_embeddings_model(model_name) -> Embeddings: - embeddings_model = HuggingFaceEmbeddings( - model_name=model_name, - model_kwargs={"device": "cpu"}, - encode_kwargs={"normalize_embeddings": False}, - ) - return embeddings_model - - -def create_vector_db(docs: List[Document], embeddings_model: Embeddings) -> VectorStore: - db = FAISS.from_documents(docs, embeddings_model) - return db - - -def load_vector_db_from_disk( - folder_path: str, index_name: str, embeddings_model: Embeddings -) -> VectorStore: - try: - db = FAISS.load_local( - folder_path=folder_path, - embeddings=embeddings_model, - index_name=index_name, - allow_dangerous_deserialization=True, # Allows pickle deserialization - ) - print("Faiss index loaded ") - return db - except Exception as e: - print("Fiass index loading failed \n", e) diff --git a/src/panza3/entities/document.py b/src/panza3/entities/document.py index ccd7f85..ff7f0dd 100644 --- a/src/panza3/entities/document.py +++ b/src/panza3/entities/document.py @@ -5,6 +5,9 @@ from datetime import datetime from typing import Dict, List, Optional, Union +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_core.documents import Document as LangchainDocument + @dataclass class Document(ABC): @@ -21,6 +24,14 @@ def deserialize(cls, data: Union[str, Dict]) -> "Document": """Convert a serialized document into a Document object.""" pass + @staticmethod + @abstractmethod + def process( + documents: List["Document"], chunk_size: int, chunk_overlap: int + ) -> List[LangchainDocument]: + """Prepare documents for storage.""" + pass + @dataclass(kw_only=True) class Email(Document): @@ -44,3 +55,19 @@ def deserialize(cls, data: Union[str, Dict]) -> "Email": raise ValueError(f"Cannot deserialize data of type {type(data)}. 
Must be str or dict.") dictionary["date"] = datetime.fromisoformat(dictionary["date"]) return cls(**dictionary) + + @staticmethod + def process(documents: List["Email"], chunk_size, chunk_overlap) -> List[LangchainDocument]: + # Convert e-mails to langchain documents + documents = [ + LangchainDocument(page_content=email.email, metadata={"serialized_document": email.serialize()}) + for email in documents + ] + + # Split long e-mails into text chunks + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + documents = text_splitter.split_documents(documents) + + return documents diff --git a/src/panza3/retriever/__init__.py b/src/panza3/retriever/__init__.py index c01b978..e61fc97 100644 --- a/src/panza3/retriever/__init__.py +++ b/src/panza3/retriever/__init__.py @@ -1,3 +1,4 @@ from .base import DocumentRetriever +from .faiss import FaissRetriever -__all__ = ["DocumentRetriever"] +__all__ = ["DocumentRetriever", "FaissRetriever"] diff --git a/src/panza3/retriever/base.py b/src/panza3/retriever/base.py index 6e12af0..a596f80 100644 --- a/src/panza3/retriever/base.py +++ b/src/panza3/retriever/base.py @@ -14,3 +14,7 @@ def retrieve_with_score( self, query: str, k: int, score: Optional[float] = None ) -> List[Tuple[Document, float]]: pass + + @abstractmethod + def store(self, documents: List[Document]): + pass diff --git a/src/panza3/retriever/faiss.py b/src/panza3/retriever/faiss.py new file mode 100644 index 0000000..7c4e567 --- /dev/null +++ b/src/panza3/retriever/faiss.py @@ -0,0 +1,96 @@ +import logging +from typing import List, Optional, Tuple + +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import FAISS +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore + +from ..entities.document import Document +from .base import DocumentRetriever + +LOGGER = logging.getLogger(__name__) + + +class FaissRetriever(DocumentRetriever): + def __init__( + self, + db_path: str, + index_name: str, + embedding_model: str, + device: str, + document_class: type[Document], + ) -> None: + + self.db_path = db_path + self.index_name = index_name + self.model_name = embedding_model + self.device = device + self.document_class = document_class + + self.embedding_model = self._get_embeddings_model(self.model_name, self.device) + self.db = self._load_vector_db_from_disk( + self.db_path, self.index_name, self.embedding_model + ) + + def _get_embeddings_model(self, model_name: str, device: str) -> Embeddings: + embeddings_model = HuggingFaceEmbeddings( + model_name=model_name, + model_kwargs={"device": device}, + encode_kwargs={"normalize_embeddings": False}, + ) + return embeddings_model + + def _load_vector_db_from_disk( + self, db_path: str, index_name: str, embeddings_model: Embeddings + ) -> VectorStore: + try: + db = FAISS.load_local( + folder_path=db_path, + embeddings=embeddings_model, + index_name=index_name, + allow_dangerous_deserialization=True, # Allows pickle deserialization + ) + LOGGER.info(f"Loaded Faiss index {index_name} from {db_path}.") + return db + except Exception as e: + LOGGER.error(f"Failed to load Faiss index {index_name} from {db_path}.
Error: {e}") + + def retrieve(self, query: str, k: int, score: Optional[float] = None) -> List[Document]: + results = self.retrieve_with_score(query, k, score) + results = [r[0] for r in results] + return results + + def retrieve_with_score( + self, query: str, k: int, score: Optional[float] = None + ) -> List[Tuple[Document, float]]: + + results = self.db._similarity_search_with_relevance_scores(query, k=k) + + # Filter by score + if score is not None: + results = [r for r in results if r[1] >= score] + + # Deserialize metadata + results = [ + (self.document_class.deserialize(r[0].metadata["serialized_document"]), r[1]) + for r in results + ] + + return results + + def store(self, documents: List[Document], chunk_size: int, chunk_overlap: int): + documents = self.document_class.process( + documents=documents, chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + db = FAISS.from_documents(documents, self.embedding_model) + + if self.db: + self.db.merge_from(db) + else: + self.db = db + + def save_db_to_disk(self): + # Save vector DB to disk + self.db.save_local(folder_path=self.db_path, index_name=self.index_name) + logging.info(f"Vector DB index {self.index_name} saved to {self.db_path}.") diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..9e62025 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,46 @@ +from datetime import datetime +from pathlib import Path + +import pytest + +from panza3.entities import Email +from panza3.retriever import FaissRetriever + + +@pytest.fixture +def embedding_model() -> str: + return "sentence-transformers/all-mpnet-base-v2" + + +@pytest.fixture +def index_name() -> str: + return "test-index" + + +@pytest.fixture(scope="function") +def faiss_db_path(tmp_path: Path, index_name: str, embedding_model: str) -> Path: + # Create a new temporary directory for each test + base_temp_dir = tmp_path / "data" + base_temp_dir.mkdir() # Ensure the data directory is created + + # Define the mock emails + emails = [ + Email(email=f"email{i}", subject=f"subject{i}", thread=[f"thread{i}"], date=datetime.now()) + for i in range(3) + ] + + # Initialize the FaissRetriever + retriever = FaissRetriever( + db_path=base_temp_dir, + index_name=index_name, + embedding_model=embedding_model, + device="cpu", + document_class=Email, + ) + + # Store the mock emails in the vector database + retriever.store(emails, chunk_size=1000, chunk_overlap=1000) + retriever.save_db_to_disk() + + # Return the path to the directory containing all mock data + return base_temp_dir diff --git a/tests/test_entities.py b/tests/test_entities.py index b486053..9740da0 100644 --- a/tests/test_entities.py +++ b/tests/test_entities.py @@ -11,3 +11,12 @@ def test_email_serialization_deserialization(): serialized = json.dumps(email.serialize()) deserialized = Email.deserialize(serialized) assert email == deserialized + + +def test_email_processing(): + email = Email(email="email", subject="subject", thread=["thread"], date=datetime.now()) + processed = Email.process([email], chunk_size=1000, chunk_overlap=1000) + assert processed[0].page_content == email.email + + deserialized = Email.deserialize(processed[0].metadata["serialized_document"]) + assert email == deserialized diff --git a/tests/test_retriever.py b/tests/test_retriever.py new file mode 100644 index 0000000..8d7af0a --- /dev/null +++ b/tests/test_retriever.py @@ -0,0 +1,66 @@ +from datetime import datetime +from pathlib import Path + +import pytest + +from panza3.entities import Email +from panza3.retriever import 
FaissRetriever + + +def get_faiss_retriever( + db_path: Path, index_name: str, embedding_model: str, device: str +) -> FaissRetriever: + return FaissRetriever( + db_path=db_path, + index_name=index_name, + embedding_model=embedding_model, + device=device, + document_class=Email, + ) + + +def test_faiss_retriever_init_empty(tmp_path: Path, index_name: str, embedding_model: str): + retriever = get_faiss_retriever(tmp_path, index_name, embedding_model, "cpu") + assert retriever is not None + assert retriever.embedding_model is not None + assert retriever.db is None + + +def test_faiss_retriever_init_existing(faiss_db_path: Path, index_name: str, embedding_model: str): + retriever = get_faiss_retriever(faiss_db_path, index_name, embedding_model, "cpu") + assert retriever is not None + assert retriever.embedding_model is not None + assert retriever.db is not None + + +def test_faiss_retriever_store_over_empty(tmp_path: Path, index_name: str, embedding_model: str): + retriever = get_faiss_retriever(tmp_path, index_name, embedding_model, "cpu") + + emails = [ + Email(email=f"email{i}", subject=f"subject{i}", thread=[f"thread{i}"], date=datetime.now()) + for i in range(3) + ] + + retriever.store(emails, chunk_size=1000, chunk_overlap=1000) + assert retriever.db is not None + + +def test_faiss_retriever_store_over_existing( + faiss_db_path: Path, index_name: str, embedding_model: str +): + retriever = get_faiss_retriever(faiss_db_path, index_name, embedding_model, "cpu") + assert retriever.db is not None + + number_existing_documents = len(retriever.db.index_to_docstore_id) + assert number_existing_documents != 0 + + number_new_documents = 3 + emails = [ + Email(email=f"email{i}", subject=f"subject{i}", thread=[f"thread{i}"], date=datetime.now()) + for i in range(number_new_documents) + ] + + retriever.store(emails, chunk_size=1000, chunk_overlap=1000) + + number_total_documents = len(retriever.db.index_to_docstore_id) + assert number_total_documents == number_existing_documents + number_new_documents From 32a8a394905c2b376ccdd58550ddb8faca3eb160 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Sat, 24 Aug 2024 14:34:58 +0200 Subject: [PATCH 011/112] Fix missing method in retriever interface --- src/panza3/retriever/base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/panza3/retriever/base.py b/src/panza3/retriever/base.py index a596f80..05bc775 100644 --- a/src/panza3/retriever/base.py +++ b/src/panza3/retriever/base.py @@ -18,3 +18,7 @@ def retrieve_with_score( @abstractmethod def store(self, documents: List[Document]): pass + + @abstractmethod + def save_db_to_disk(self): + pass From 094fa246b774396f10da569101a4e877db35b3ee Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Sat, 24 Aug 2024 17:07:37 +0200 Subject: [PATCH 012/112] Make thread and past_messages optional in EmailInstruction --- src/panza3/entities/instruction.py | 6 +++--- tests/test_entities.py | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/panza3/entities/instruction.py b/src/panza3/entities/instruction.py index 449b4e3..544f5f5 100644 --- a/src/panza3/entities/instruction.py +++ b/src/panza3/entities/instruction.py @@ -1,5 +1,5 @@ from abc import ABC -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List from ..llm import ChatHistoryType @@ -8,9 +8,9 @@ @dataclass class Instruction(ABC): instruction: str - past_messages: ChatHistoryType + past_messages: ChatHistoryType = field(default_factory=list) 
@dataclass(kw_only=True) class EmailInstruction(Instruction): - thread: List[str] + thread: List[str] = field(default_factory=list) diff --git a/tests/test_entities.py b/tests/test_entities.py index 9740da0..90cb28d 100644 --- a/tests/test_entities.py +++ b/tests/test_entities.py @@ -3,7 +3,7 @@ import pytest -from panza3.entities import Email +from panza3.entities import Email, EmailInstruction def test_email_serialization_deserialization(): @@ -20,3 +20,23 @@ def test_email_processing(): deserialized = Email.deserialize(processed[0].metadata["serialized_document"]) assert email == deserialized + + +def test_email_instruction_init(): + instruction = EmailInstruction(instruction="Write an email.") + assert instruction.instruction == "Write an email." + assert instruction.thread == [] + assert instruction.past_messages == [] + + instruction = EmailInstruction( + instruction="Write an email.", + thread=["thread"], + past_messages=[{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hi!"}], + ) + + assert instruction.instruction == "Write an email." + assert instruction.thread == ["thread"] + assert instruction.past_messages == [ + {"role": "user", "content": "Hi!"}, + {"role": "assistant", "content": "Hi!"}, + ] From c9ea456ceedcddf8e1a52470f733b9d64538a858 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Sat, 24 Aug 2024 20:39:32 +0200 Subject: [PATCH 013/112] Add email prompt builder --- src/panza/utils/prompting.py | 131 ---------------------- src/panza3/entities/document.py | 4 +- src/panza3/prompting/__init__.py | 3 +- src/panza3/prompting/email_prompting.py | 137 ++++++++++++++++++++++++ src/panza3/prompting/utils.py | 20 ++++ tests/conftest.py | 35 ++++++ tests/test_prompting.py | 111 +++++++++++++++++++ 7 files changed, 307 insertions(+), 134 deletions(-) create mode 100644 src/panza3/prompting/email_prompting.py create mode 100644 src/panza3/prompting/utils.py create mode 100644 tests/test_prompting.py diff --git a/src/panza/utils/prompting.py b/src/panza/utils/prompting.py index 6096b79..38f4fbe 100644 --- a/src/panza/utils/prompting.py +++ b/src/panza/utils/prompting.py @@ -18,137 +18,6 @@ PHI3_RESPONSE_END_WRAPPER = "<|end|>" -def create_prompt( - user_input: Text, - system_preamble: Text, - user_preamble: Text, - rag_preamble: Optional[Text] = None, - relevant_emails: Optional[List[Email]] = None, - thread_preamble: Optional[Text] = None, - thread_emails: Optional[List[Text]] = None, -) -> Text: - - if relevant_emails: - assert rag_preamble, "RAG preamble format must be provided if similar emails are provided." - rag_prompt = _create_rag_preamble_from_emails(rag_preamble, relevant_emails).strip() - else: - rag_prompt = "" - - if thread_emails: - assert thread_preamble, "Thread preamble format must be provided if thread is provided." 
- thread_prompt = _create_threading_preamble( - thread_preamble, thread_emails - ).strip() - else: - thread_prompt = "" - - system_preamble = system_preamble.strip() - user_preamble = user_preamble.strip() - - prompt = "" - if system_preamble: - prompt += f"{system_preamble}\n\n" - if user_preamble: - prompt += f"{user_preamble}\n\n" - if rag_prompt: - prompt += f"{rag_prompt}\n\n" - if thread_prompt: - prompt += f"{thread_prompt}\n\n" - prompt += f"Instruction: {user_input}" - - return prompt - - -def _create_rag_preamble_from_emails(rag_preamble_format: Text, emails: List[Email]) -> Text: - rag_context = _create_rag_context_from_emails(emails) - return rag_preamble_format.format(rag_context=rag_context) - - -def _create_rag_context_from_emails(emails: List[Email]) -> Text: - """Creates a RAG context from a list of relevant e-mails. - - The e-mails are formatted as follows: - - SUBJECT: - E-MAIL CONTENT: - - - --- - - SUBJECT: - E-MAIL CONTENT: - - - --- - ... - """ - - rag_context = "" - for email in emails: - rag_context += f"SUBJECT: {email.subject}\n" f"E-MAIL CONTENT:\n{email.email}\n\n---\n\n" - - return rag_context - - -def _create_threading_preamble( - threading_preamble_format: Text, thread: List[Text] -) -> Text: - threading_context = _create_threading_context(thread) - return threading_preamble_format.format(threading_context=threading_context) - - -def _create_threading_context(thread: List[Text]) -> Text: - """Creates a threading context from a list of relevant e-mails. - - The e-mails are formatted as follows: - - - - --- - - - - --- - ... - """ - - threading_context = "" - for email in thread: - threading_context += f"{email}\n\n---\n\n" - - return threading_context - - -def load_preamble(path): - with open(path, "r") as file: - return file.read().strip() - - -# The user preamble must be edited by the user in order to work as intended. -# Here, we perform additional checks to make sure that that happened; if not, -# We issue a warning to the user. -def load_user_preamble(path): - with open(path, "r") as file: - lines = [l for l in file.readlines() if not l.strip().startswith("#")] - print(lines) - preamble = "".join(lines) - if "CHANGE ME" in preamble: - print( - "*" * 66 - + "\n* WARNING: User prompt preamble not customized. 
*\n* Please edit the preamble at prompt_preambles/user_preamble.txt *\n" - + "*" * 66 - ) - return preamble - - -def load_all_preambles(system_preamble, user_preamble, rag_preamble, thread_preamble): - system_preamble = load_preamble(system_preamble) if system_preamble else "" - user_preamble = load_user_preamble(user_preamble) if user_preamble else "" - rag_preamble = load_preamble(rag_preamble) if rag_preamble else "" - thread_preamble = load_preamble(thread_preamble) if thread_preamble else "" - return system_preamble, user_preamble, rag_preamble, thread_preamble - - def get_model_special_tokens(model_name): model_name = model_name.lower() if "llama" in model_name: diff --git a/src/panza3/entities/document.py b/src/panza3/entities/document.py index ff7f0dd..7b06dd2 100644 --- a/src/panza3/entities/document.py +++ b/src/panza3/entities/document.py @@ -1,7 +1,7 @@ import copy import json from abc import ABC, abstractmethod -from dataclasses import asdict, dataclass +from dataclasses import asdict, dataclass, field from datetime import datetime from typing import Dict, List, Optional, Union @@ -37,7 +37,7 @@ def process( class Email(Document): email: str subject: str - thread: List[str] + thread: List[str] = field(default_factory=list) date: datetime def serialize(self) -> dict: diff --git a/src/panza3/prompting/__init__.py b/src/panza3/prompting/__init__.py index 3b89e60..4257c7c 100644 --- a/src/panza3/prompting/__init__.py +++ b/src/panza3/prompting/__init__.py @@ -1,3 +1,4 @@ from .base import PromptBuilder +from .email_prompting import EmailPromptBuilder -__all__ = ["PromptBuilder"] +__all__ = ["PromptBuilder", "EmailPromptBuilder"] diff --git a/src/panza3/prompting/email_prompting.py b/src/panza3/prompting/email_prompting.py new file mode 100644 index 0000000..b850982 --- /dev/null +++ b/src/panza3/prompting/email_prompting.py @@ -0,0 +1,137 @@ +from abc import ABC, abstractmethod +from typing import List, Tuple + +from ..entities import Email, EmailInstruction +from ..retriever import DocumentRetriever +from .base import PromptBuilder +from .utils import load_preamble, load_user_preamble + + +class EmailPromptBuilder(PromptBuilder): + def __init__( + self, + retriever: DocumentRetriever, + system_preamble: str, + user_preamble: str, + rag_preamble: str, + thread_preamble: str, + number_rag_emails: int, + rag_relevance_threshold: float, + number_thread_emails: int, + ): + self.retriever = retriever + self.system_preamble = system_preamble + self.user_preamble = user_preamble + self.rag_preamble = rag_preamble + self.thread_preamble = thread_preamble + self.number_rag_emails = number_rag_emails + self.rag_relevance_threshold = rag_relevance_threshold + self.number_thread_emails = number_thread_emails + + def _create_rag_preamble_from_emails(self, emails: List[Email]) -> str: + rag_context = self._create_rag_context_from_emails(emails) + return self.rag_preamble.format(rag_context=rag_context) + + def _create_rag_context_from_emails(self, emails: List[Email]) -> str: + """Creates a RAG context from a list of relevant e-mails. + + The e-mails are formatted as follows: + + E-MAIL CONTENT: + + + --- + + E-MAIL CONTENT: + + + --- + ... 
+ """ + + rag_context = "" + for email in emails: + rag_context += f"E-MAIL CONTENT:\n{email.email}\n\n---\n\n" + + return rag_context + + def _create_threading_preamble(self, thread: List[str]) -> str: + threading_context = self._create_threading_context(thread) + return self.thread_preamble.format(threading_context=threading_context) + + def _create_threading_context(self, thread: List[str]) -> str: + """Creates a threading context from a list of relevant e-mails. + + The e-mails are formatted as follows: + + + + --- + + + + --- + ... + """ + + threading_context = "" + for email in thread: + threading_context += f"{email}\n\n---\n\n" + + return threading_context + + @staticmethod + def load_all_preambles( + system_preamble_path: str, + user_preamble_path: str, + rag_preamble_path: str, + thread_preamble_path: str, + ) -> Tuple[str, str, str, str]: + """Load all preambles from file.""" + system_preamble = load_preamble(system_preamble_path) if system_preamble_path else "" + user_preamble = load_user_preamble(user_preamble_path) if user_preamble_path else "" + rag_preamble = load_preamble(rag_preamble_path) if rag_preamble_path else "" + thread_preamble = load_preamble(thread_preamble_path) if thread_preamble_path else "" + return system_preamble, user_preamble, rag_preamble, thread_preamble + + def build_prompt( + self, + instruction: EmailInstruction, + use_rag: bool = True, + use_thread: bool = True, + ) -> str: + + if use_rag and not self.rag_preamble: + raise ValueError("RAG preamble format must be provided if RAG is used.") + + if use_thread and not self.thread_preamble: + raise ValueError("Thread preamble format must be provided if thread is used.") + + if use_rag: + relevant_emails = self.retriever.retrieve( + instruction.instruction, self.number_rag_emails, self.rag_relevance_threshold + ) + rag_prompt = self._create_rag_preamble_from_emails(relevant_emails).strip() + else: + rag_prompt = "" + + if use_thread: + thread_prompt = self._create_threading_preamble(instruction.thread).strip() + else: + thread_prompt = "" + + system_preamble = self.system_preamble.strip() + user_preamble = self.user_preamble.strip() + + prompt = "" + if system_preamble: + prompt += f"{system_preamble}\n\n" + if user_preamble: + prompt += f"{user_preamble}\n\n" + if rag_prompt: + prompt += f"{rag_prompt}\n\n" + if thread_prompt: + prompt += f"{thread_prompt}\n\n" + prompt += f"Instruction: {instruction.instruction}" + + return prompt diff --git a/src/panza3/prompting/utils.py b/src/panza3/prompting/utils.py new file mode 100644 index 0000000..f04f32b --- /dev/null +++ b/src/panza3/prompting/utils.py @@ -0,0 +1,20 @@ +def load_preamble(path: str) -> str: + with open(path, "r") as file: + return file.read().strip() + + +def load_user_preamble(path: str) -> str: + # The user preamble must be edited by the user in order to work as intended. + # Here, we perform additional checks to make sure that that happened; if not, + # We issue a warning to the user. + with open(path, "r") as file: + lines = [l for l in file.readlines() if not l.strip().startswith("#")] + print(lines) + preamble = "".join(lines) + if "CHANGE ME" in preamble: + print( + "*" * 66 + + "\n* WARNING: User prompt preamble not customized. 
*\n* Please edit the preamble at prompt_preambles/user_preamble.txt *\n" + + "*" * 66 + ) + return preamble diff --git a/tests/conftest.py b/tests/conftest.py index 9e62025..e27f81d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -44,3 +44,38 @@ def faiss_db_path(tmp_path: Path, index_name: str, embedding_model: str) -> Path # Return the path to the directory containing all mock data return base_temp_dir + + +@pytest.fixture +def preambles_path(tmp_path: Path) -> Path: + preambles_path = tmp_path / "prompt_preambles" + preambles_path.mkdir(parents=True) + return preambles_path + + +@pytest.fixture +def system_preamble_path(preambles_path) -> Path: + system_preamble_path = preambles_path / "system_preamble.txt" + system_preamble_path.write_text("") + return system_preamble_path + + +@pytest.fixture +def user_preamble_path(preambles_path) -> Path: + user_preamble_path = preambles_path / "user_preamble.txt" + user_preamble_path.write_text("") + return user_preamble_path + + +@pytest.fixture +def rag_preamble_path(preambles_path) -> Path: + rag_preamble_path = preambles_path / "rag_preamble.txt" + rag_preamble_path.write_text("RAG PREAMBLE:\n\n{rag_context}") + return rag_preamble_path + + +@pytest.fixture +def thread_preamble_path(preambles_path) -> Path: + thread_preamble_path = preambles_path / "thread_preamble.txt" + thread_preamble_path.write_text("THREAD PREAMBLE:\n\n{threading_context}") + return thread_preamble_path diff --git a/tests/test_prompting.py b/tests/test_prompting.py new file mode 100644 index 0000000..bdd4eef --- /dev/null +++ b/tests/test_prompting.py @@ -0,0 +1,111 @@ +from datetime import datetime +from pathlib import Path +from unittest.mock import MagicMock + +import pytest + +from panza3.entities import Email, EmailInstruction +from panza3.prompting import EmailPromptBuilder +from panza3.retriever import FaissRetriever + + +def test_email_prompt_builder( + system_preamble_path: Path, + user_preamble_path: Path, + rag_preamble_path: Path, + thread_preamble_path: Path, +): + # TODO: Split into multiple tests + + # Patch the retrieve method to return a list of emails + mock_retriever = MagicMock(spec=FaissRetriever) + emails = [ + Email(email=f"email{i}", subject=f"subject{i}", thread=[f"thread{i}"], date=datetime.now()) + for i in range(3) + ] + mock_retriever.retrieve.return_value = emails + + instruction = EmailInstruction( + instruction="Write an email.", thread=["email1", "email2", "email3"] + ) + + system_preamble, user_preamble, rag_preamble, thread_preamble = ( + EmailPromptBuilder.load_all_preambles( + system_preamble_path=system_preamble_path, + user_preamble_path=user_preamble_path, + rag_preamble_path=rag_preamble_path, + thread_preamble_path=thread_preamble_path, + ) + ) + + prompt_builder = EmailPromptBuilder( + retriever=mock_retriever, + system_preamble=system_preamble, + user_preamble=user_preamble, + rag_preamble=rag_preamble, + thread_preamble=thread_preamble, + number_rag_emails=3, + rag_relevance_threshold=0.0, + number_thread_emails=1, + ) + + rag_prompt = prompt_builder._create_rag_preamble_from_emails(emails=emails) + + assert rag_prompt == ( + "RAG PREAMBLE:\n\n" + + "E-MAIL CONTENT:\nemail0\n\n---\n\n" + + "E-MAIL CONTENT:\nemail1\n\n---\n\n" + + "E-MAIL CONTENT:\nemail2\n\n---\n\n" + ) + + thread_prompt = prompt_builder._create_threading_preamble(thread=instruction.thread) + + assert thread_prompt == ( + "THREAD PREAMBLE:\n\n" + "email1\n\n---\n\n" + "email2\n\n---\n\n" + "email3\n\n---\n\n" + ) + + # Test full prompt + prompt = 
prompt_builder.build_prompt(instruction=instruction, use_rag=True, use_thread=True) + assert prompt == ( + "\n\n" + + "\n\n" + + "RAG PREAMBLE:\n\n" + + "E-MAIL CONTENT:\nemail0\n\n---\n\n" + + "E-MAIL CONTENT:\nemail1\n\n---\n\n" + + "E-MAIL CONTENT:\nemail2\n\n---\n\n" + + "THREAD PREAMBLE:\n\n" + + "email1\n\n---\n\n" + + "email2\n\n---\n\n" + + "email3\n\n---\n\n" + + "Instruction: Write an email." + ) + + # Test prompt without RAG + prompt = prompt_builder.build_prompt(instruction=instruction, use_rag=False, use_thread=True) + assert prompt == ( + "\n\n" + + "\n\n" + + "THREAD PREAMBLE:\n\n" + + "email1\n\n---\n\n" + + "email2\n\n---\n\n" + + "email3\n\n---\n\n" + + "Instruction: Write an email." + ) + + # Test prompt without thread + prompt = prompt_builder.build_prompt(instruction=instruction, use_rag=True, use_thread=False) + assert prompt == ( + "\n\n" + + "\n\n" + + "RAG PREAMBLE:\n\n" + + "E-MAIL CONTENT:\nemail0\n\n---\n\n" + + "E-MAIL CONTENT:\nemail1\n\n---\n\n" + + "E-MAIL CONTENT:\nemail2\n\n---\n\n" + + "Instruction: Write an email." + ) + + # Test prompt without RAG and thread + prompt = prompt_builder.build_prompt(instruction=instruction, use_rag=False, use_thread=False) + assert prompt == ( + "\n\n" + "\n\n" + "Instruction: Write an email." + ) From 6a9897eee89e914e26c98ce89a827e15b058ed0a Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Sun, 25 Aug 2024 21:07:36 +0200 Subject: [PATCH 014/112] Add local transformers inference --- src/panza/evaluation/base_inference.py | 59 -------------- src/panza3/llm/__init__.py | 3 +- src/panza3/llm/base.py | 5 +- src/panza3/llm/local.py | 101 ++++++++++++++++++++++++ src/panza3/prompting/email_prompting.py | 1 - tests/conftest.py | 3 + tests/test_llm.py | 59 ++++++++++++++ 7 files changed, 168 insertions(+), 63 deletions(-) create mode 100644 src/panza3/llm/local.py create mode 100644 tests/test_llm.py diff --git a/src/panza/evaluation/base_inference.py b/src/panza/evaluation/base_inference.py index adc79bb..744f635 100644 --- a/src/panza/evaluation/base_inference.py +++ b/src/panza/evaluation/base_inference.py @@ -3,9 +3,6 @@ import sys import torch -from peft import AutoPeftModelForCausalLM -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig) sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) @@ -45,21 +42,6 @@ def get_base_inference_args_parser(): def load_model_and_tokenizer(model_path, device, dtype, load_in_4bit): - assert dtype in [None, "fp32", "bf16"] - if device == "cpu": - assert dtype == "fp32", "CPU only supports fp32, please specify --dtype fp32" - dtype = None if dtype is None else (torch.float32 if dtype == "fp32" else torch.bfloat16) - - quant_config = ( - BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=dtype, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - ) - if load_in_4bit - else None - ) if os.path.exists(os.path.join(model_path, "adapter_config.json")): print("found an adapter.") @@ -131,44 +113,3 @@ def run_inference( prompts.append(prompt) messages = [{"role": "user", "content": prompt}] batch.append(messages) - - encodeds = tokenizer.apply_chat_template( - batch, - return_tensors="pt", - add_generation_prompt=True, - padding=True, - truncation=True, - return_dict=True, - ) - model_inputs = encodeds.to(device) - - if best: - generated_ids = model.generate( - **model_inputs, - max_new_tokens=max_new_tokens, - do_sample=False, - num_beams=1, - pad_token_id=tokenizer.pad_token_id, - ) - else: - generated_ids 
= model.generate( - **model_inputs, - max_new_tokens=max_new_tokens, - do_sample=True, - temperature=temperature, - top_k=top_k, - top_p=top_p, - pad_token_id=tokenizer.pad_token_id, - ) - - outputs = tokenizer.batch_decode(generated_ids) - - # Clean outputs - _, prompt_end_wrapper, _, response_end_wrapper = prompting.get_model_special_tokens( - model.name_or_path - ) - outputs = [ - output.split(prompt_end_wrapper)[-1].split(response_end_wrapper)[0] for output in outputs - ] - - return prompts, outputs diff --git a/src/panza3/llm/__init__.py b/src/panza3/llm/__init__.py index c278cd2..6d016d6 100644 --- a/src/panza3/llm/__init__.py +++ b/src/panza3/llm/__init__.py @@ -1,4 +1,5 @@ from .base import LLM, ChatHistoryType, MessageType +from .local import TransformersLLM from .ollama_llm import OllamaLLM -__all__ = ["LLM", "ChatHistoryType", "MessageType", "OllamaLLM"] +__all__ = ["LLM", "ChatHistoryType", "MessageType", "OllamaLLM", "TransformersLLM"] diff --git a/src/panza3/llm/base.py b/src/panza3/llm/base.py index 3bcad4f..6293881 100644 --- a/src/panza3/llm/base.py +++ b/src/panza3/llm/base.py @@ -6,11 +6,12 @@ class LLM(ABC): - def __init__(self, name: str): + def __init__(self, name: str, sampling_parameters: Dict): self.name = name + self.sampling_parameters = sampling_parameters @abstractmethod - def chat(self, messages: ChatHistoryType) -> str: + def chat(self, messages: ChatHistoryType | List[ChatHistoryType]) -> List[str]: pass @abstractmethod diff --git a/src/panza3/llm/local.py b/src/panza3/llm/local.py new file mode 100644 index 0000000..5d122d9 --- /dev/null +++ b/src/panza3/llm/local.py @@ -0,0 +1,101 @@ +from typing import Dict, Iterator, List + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +from .base import LLM, ChatHistoryType + + +class LocalLLM(LLM): + def __init__( + self, + name: str, + checkpoint: str, + device: str, + sampling_parameters: Dict, + dtype: str, + load_in_4bit: bool, + ): + super().__init__(name, sampling_parameters) + self.checkpoint = checkpoint + self.device = device + + assert dtype in [None, "fp32", "bf16"] + if device == "cpu": + assert dtype == "fp32", "CPU only supports fp32, please specify --dtype fp32" + dtype = None if dtype is None else (torch.float32 if dtype == "fp32" else torch.bfloat16) + self.dtype = dtype + + self.load_in_4bit = load_in_4bit + # TODO: Add conditional import for BitsAndBytesConfig? 
+ self.quantization_config = ( + BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=dtype, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + if load_in_4bit + else None + ) + + self._load_model_and_tokenizer() + + def _load_model_and_tokenizer(self) -> None: + pass + + +class TransformersLLM(LocalLLM): + + def _load_model_and_tokenizer(self): + if self.load_in_4bit: + self.model = AutoModelForCausalLM.from_pretrained( + self.checkpoint, + device_map=self.device, + quantization_config=self.quantization_config, + trust_remote_code=True, + ) + else: + self.model = AutoModelForCausalLM.from_pretrained( + self.checkpoint, + torch_dtype=self.dtype, + device_map=self.device, + trust_remote_code=True, + ) + + self.tokenizer = AutoTokenizer.from_pretrained( + self.checkpoint, model_max_length=self.model.config.max_position_embeddings + ) + self.tokenizer.padding_side = "left" + self.tokenizer.pad_token = self.tokenizer.eos_token + + def chat(self, messages: ChatHistoryType | List[ChatHistoryType]) -> List[str]: + encodeds = self.tokenizer.apply_chat_template( + messages, + return_tensors="pt", + add_generation_prompt=True, + padding=True, + truncation=True, + return_dict=True, + ) + model_inputs = encodeds.to(self.device) + + generated_ids = self.model.generate( + **model_inputs, + **self.sampling_parameters, + pad_token_id=self.tokenizer.pad_token_id, + ) + + prompt_length = encodeds["input_ids"].shape[1] + outputs = self.tokenizer.batch_decode( + generated_ids[:, prompt_length:], skip_special_tokens=True + ) + + return outputs + + def chat_stream(self, messages: ChatHistoryType) -> Iterator[str]: + if isinstance(messages[0], (list, tuple)) or hasattr(messages[0], "messages"): + raise TypeError("chat_stream does not support batched messages.") + + # TODO: Implement chat_stream. 
+ raise NotImplementedError("chat_stream is not implemented for TransformersLLM.") diff --git a/src/panza3/prompting/email_prompting.py b/src/panza3/prompting/email_prompting.py index b850982..8ca0390 100644 --- a/src/panza3/prompting/email_prompting.py +++ b/src/panza3/prompting/email_prompting.py @@ -1,4 +1,3 @@ -from abc import ABC, abstractmethod from typing import List, Tuple from ..entities import Email, EmailInstruction diff --git a/tests/conftest.py b/tests/conftest.py index e27f81d..fa6008d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,9 @@ def embedding_model() -> str: return "sentence-transformers/all-mpnet-base-v2" +@pytest.fixture +def generative_model() -> str: + return "microsoft/Phi-3-mini-4k-instruct" @pytest.fixture def index_name() -> str: diff --git a/tests/test_llm.py b/tests/test_llm.py new file mode 100644 index 0000000..4291152 --- /dev/null +++ b/tests/test_llm.py @@ -0,0 +1,59 @@ +import pytest + +from panza3.llm import TransformersLLM + + +def test_transformers_llm_init(generative_model: str): + model = TransformersLLM( + name="huggingface_model", + checkpoint=generative_model, + device="cpu", + sampling_parameters={"do_sample": False, "max_new_tokens": 50}, + dtype="fp32", + load_in_4bit=False, + ) + assert model is not None + assert model.name == "huggingface_model" + assert model.checkpoint == generative_model + assert model.model is not None + assert model.tokenizer is not None + + +def test_transformers_llm_generate(generative_model: str): + model = TransformersLLM( + name="huggingface_model", + checkpoint=generative_model, + device="cpu", + sampling_parameters={"do_sample": False, "max_new_tokens": 50}, + dtype="fp32", + load_in_4bit=False, + ) + + messages = [{"role": "user", "content": "Write something."}] + + outputs = model.chat(messages) + + assert outputs is not None + assert len(outputs) == 1 + + +def test_transformers_llm_generate_batch(generative_model: str): + model = TransformersLLM( + name="huggingface_model", + checkpoint=generative_model, + device="cpu", + sampling_parameters={"do_sample": False, "max_new_tokens": 50}, + dtype="fp32", + load_in_4bit=False, + ) + + messages = [ + [{"role": "user", "content": "Write something."}], + [{"role": "user", "content": "Write something else."}], + [{"role": "user", "content": "Write something different."}], + ] + + outputs = model.chat(messages) + + assert outputs is not None + assert len(outputs) == 3 From 30b3dc1068fa373beaacf89e80f4308f85500c3f Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Mon, 26 Aug 2024 12:42:00 +0200 Subject: [PATCH 015/112] Add Peft models and conditional imports --- src/panza3/llm/__init__.py | 4 +- src/panza3/llm/local.py | 113 +++++++++++++++++++++++++++---------- tests/conftest.py | 4 ++ tests/test_llm.py | 87 +++++++++++++++++++++++----- 4 files changed, 160 insertions(+), 48 deletions(-) diff --git a/src/panza3/llm/__init__.py b/src/panza3/llm/__init__.py index 6d016d6..20e4146 100644 --- a/src/panza3/llm/__init__.py +++ b/src/panza3/llm/__init__.py @@ -1,5 +1,5 @@ from .base import LLM, ChatHistoryType, MessageType -from .local import TransformersLLM +from .local import PeftLLM, TransformersLLM from .ollama_llm import OllamaLLM -__all__ = ["LLM", "ChatHistoryType", "MessageType", "OllamaLLM", "TransformersLLM"] +__all__ = ["LLM", "ChatHistoryType", "MessageType", "OllamaLLM", "TransformersLLM", "PeftLLM"] diff --git a/src/panza3/llm/local.py b/src/panza3/llm/local.py index 5d122d9..bc345e8 100644 --- a/src/panza3/llm/local.py +++ 
b/src/panza3/llm/local.py @@ -1,7 +1,29 @@ -from typing import Dict, Iterator, List +from abc import abstractmethod +from typing import Any, Dict, Iterator, List, Type import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +_MISSING_LIBRARIES = [] + +try: + from peft import AutoPeftModelForCausalLM +except ImportError: + AutoPeftModelForCausalLM = None + _MISSING_LIBRARIES.append("peft") + +try: + from transformers import AutoModelForCausalLM, AutoTokenizer +except ImportError: + AutoModelForCausalLM = None + AutoTokenizer = None + _MISSING_LIBRARIES.append("transformers") + +try: + from transformers import BitsAndBytesConfig +except ImportError: + BitsAndBytesConfig = None + _MISSING_LIBRARIES.append("bitsandbytes") + from .base import LLM, ChatHistoryType @@ -16,6 +38,8 @@ def __init__( dtype: str, load_in_4bit: bool, ): + self._check_installation() + super().__init__(name, sampling_parameters) self.checkpoint = checkpoint self.device = device @@ -41,34 +65,6 @@ def __init__( self._load_model_and_tokenizer() - def _load_model_and_tokenizer(self) -> None: - pass - - -class TransformersLLM(LocalLLM): - - def _load_model_and_tokenizer(self): - if self.load_in_4bit: - self.model = AutoModelForCausalLM.from_pretrained( - self.checkpoint, - device_map=self.device, - quantization_config=self.quantization_config, - trust_remote_code=True, - ) - else: - self.model = AutoModelForCausalLM.from_pretrained( - self.checkpoint, - torch_dtype=self.dtype, - device_map=self.device, - trust_remote_code=True, - ) - - self.tokenizer = AutoTokenizer.from_pretrained( - self.checkpoint, model_max_length=self.model.config.max_position_embeddings - ) - self.tokenizer.padding_side = "left" - self.tokenizer.pad_token = self.tokenizer.eos_token - def chat(self, messages: ChatHistoryType | List[ChatHistoryType]) -> List[str]: encodeds = self.tokenizer.apply_chat_template( messages, @@ -98,4 +94,59 @@ def chat_stream(self, messages: ChatHistoryType) -> Iterator[str]: raise TypeError("chat_stream does not support batched messages.") # TODO: Implement chat_stream. - raise NotImplementedError("chat_stream is not implemented for TransformersLLM.") + raise NotImplementedError("chat_stream is not implemented for LocalLLM.") + + def _check_installation(self) -> None: + if AutoModelForCausalLM is None or AutoTokenizer is None: + raise ImportError( + "transformers is not installed. Please install it with `pip install transformers`." + ) + + if BitsAndBytesConfig is None: + from transformers import __version__ as version + + raise ImportError( + f"transformers {version} does not support 4-bit quantization. Please upgrade to a newer version." 
+ ) + + def _load_model_and_tokenizer_with_constructor(self, model_class: Type[Any]) -> None: + if self.load_in_4bit: + self.model = model_class.from_pretrained( + self.checkpoint, + device_map=self.device, + quantization_config=self.quantization_config, + trust_remote_code=True, + ) + else: + self.model = model_class.from_pretrained( + self.checkpoint, + torch_dtype=self.dtype, + device_map=self.device, + trust_remote_code=True, + ) + + self.tokenizer = AutoTokenizer.from_pretrained( + self.checkpoint, model_max_length=self.model.config.max_position_embeddings + ) + self.tokenizer.padding_side = "left" + self.tokenizer.pad_token = self.tokenizer.eos_token + + @abstractmethod + def _load_model_and_tokenizer(self) -> None: + pass + + +class TransformersLLM(LocalLLM): + def _load_model_and_tokenizer(self): + self._load_model_and_tokenizer_with_constructor(AutoModelForCausalLM) + + +class PeftLLM(LocalLLM): + def _check_installation(self) -> None: + super()._check_installation() + if AutoPeftModelForCausalLM is None: + raise ImportError("peft is not installed.") + + def _load_model_and_tokenizer(self) -> None: + self._load_model_and_tokenizer_with_constructor(AutoPeftModelForCausalLM) + self.model = self.model.merge_and_unload() diff --git a/tests/conftest.py b/tests/conftest.py index fa6008d..11d13ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,6 +15,10 @@ def embedding_model() -> str: def generative_model() -> str: return "microsoft/Phi-3-mini-4k-instruct" +@pytest.fixture +def peft_model() -> str: + return "microsoft/Phi-3-mini-4k-instruct" + @pytest.fixture def index_name() -> str: return "test-index" diff --git a/tests/test_llm.py b/tests/test_llm.py index 4291152..3ba5ab9 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -1,28 +1,71 @@ +from typing import Type + import pytest +from torch import float32 as torch_float32 + +from panza3.llm import PeftLLM, TransformersLLM +from panza3.llm.local import _MISSING_LIBRARIES, LocalLLM -from panza3.llm import TransformersLLM +skip_if_no_transformers = pytest.mark.skipif( + "transformers" in _MISSING_LIBRARIES, reason="transformers is not installed" +) +skip_if_no_peft = pytest.mark.skipif("peft" in _MISSING_LIBRARIES, reason="peft is not installed") +skip_if_no_bitsandbytes = pytest.mark.skipif( + "bitsandbytes" in _MISSING_LIBRARIES, reason="bitsandbytes is not installed" +) -def test_transformers_llm_init(generative_model: str): - model = TransformersLLM( - name="huggingface_model", - checkpoint=generative_model, +@pytest.mark.parametrize( + "local_llm_class, checkpoint", + [ + pytest.param( + TransformersLLM, "microsoft/Phi-3-mini-4k-instruct", marks=skip_if_no_transformers + ), + # TODO: Replace local Peft checkpoint with fixture + pytest.param( + PeftLLM, + "/nfs/scistore19/alistgrp/Checkpoints/Panza/shared/armand/models/test_rosa_checkpoint", + marks=[skip_if_no_transformers, skip_if_no_peft], + ), + ], +) +def test_local_llm_init(local_llm_class: Type[LocalLLM], checkpoint: str): + model = local_llm_class( + name="local_llm", + checkpoint=checkpoint, device="cpu", sampling_parameters={"do_sample": False, "max_new_tokens": 50}, dtype="fp32", load_in_4bit=False, ) assert model is not None - assert model.name == "huggingface_model" - assert model.checkpoint == generative_model + assert model.name == "local_llm" + assert model.checkpoint == checkpoint assert model.model is not None assert model.tokenizer is not None + assert model.model.device.type == "cpu" + assert model.dtype == torch_float32 + assert 
model.model.dtype == model.dtype -def test_transformers_llm_generate(generative_model: str): - model = TransformersLLM( - name="huggingface_model", - checkpoint=generative_model, +@pytest.mark.parametrize( + "local_llm_class, checkpoint", + [ + pytest.param( + TransformersLLM, "microsoft/Phi-3-mini-4k-instruct", marks=skip_if_no_transformers + ), + # TODO: Replace local Peft checkpoint with fixture + pytest.param( + PeftLLM, + "/nfs/scistore19/alistgrp/Checkpoints/Panza/shared/armand/models/test_rosa_checkpoint", + marks=[skip_if_no_transformers, skip_if_no_peft], + ), + ], +) +def test_local_llm_generate(local_llm_class: Type[LocalLLM], checkpoint: str): + model = local_llm_class( + name="local_llm", + checkpoint=checkpoint, device="cpu", sampling_parameters={"do_sample": False, "max_new_tokens": 50}, dtype="fp32", @@ -37,10 +80,24 @@ def test_transformers_llm_generate(generative_model: str): assert len(outputs) == 1 -def test_transformers_llm_generate_batch(generative_model: str): - model = TransformersLLM( - name="huggingface_model", - checkpoint=generative_model, +@pytest.mark.parametrize( + "local_llm_class, checkpoint", + [ + pytest.param( + TransformersLLM, "microsoft/Phi-3-mini-4k-instruct", marks=skip_if_no_transformers + ), + # TODO: Replace local Peft checkpoint with fixture + pytest.param( + PeftLLM, + "/nfs/scistore19/alistgrp/Checkpoints/Panza/shared/armand/models/test_rosa_checkpoint", + marks=[skip_if_no_transformers, skip_if_no_peft], + ), + ], +) +def test_local_llm_generate_batch(local_llm_class: Type[LocalLLM], checkpoint: str): + model = local_llm_class( + name="local_llm", + checkpoint=checkpoint, device="cpu", sampling_parameters={"do_sample": False, "max_new_tokens": 50}, dtype="fp32", From 1d73affbe2e5d324782d5422ce07fef943e32739 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Mon, 26 Aug 2024 17:06:19 +0200 Subject: [PATCH 016/112] Add Panza Writer --- src/panza3/writer.py | 28 ++++++++++++++++++++++++++++ src/panza3/writer/__init__.py | 0 src/panza3/writer/base.py | 15 --------------- tests/test_writer.py | 27 +++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 15 deletions(-) create mode 100644 src/panza3/writer.py delete mode 100644 src/panza3/writer/__init__.py delete mode 100644 src/panza3/writer/base.py create mode 100644 tests/test_writer.py diff --git a/src/panza3/writer.py b/src/panza3/writer.py new file mode 100644 index 0000000..8c6a2bf --- /dev/null +++ b/src/panza3/writer.py @@ -0,0 +1,28 @@ +from typing import Iterator, List + +from .entities import Instruction +from .llm import LLM, MessageType +from .prompting import PromptBuilder + + +# TODO: Check that instruction type is compatible with prompt_builder type? 
+class PanzaWriter: + def __init__(self, prompt_builder: PromptBuilder, llm: LLM): + self.prompt_builder = prompt_builder + self.llm = llm + + def run(self, instruction: Instruction, stream: bool = False) -> str | Iterator[str]: + prompt = self.prompt_builder.build_prompt(instruction) + messages = self._create_user_message(content=prompt) + if stream: + return self.llm.chat_stream(messages) + else: + return self.llm.chat(messages)[0] + + def run_batch(self, instructions: List[Instruction]) -> List[str]: + prompts = [self.prompt_builder.build_prompt(instruction) for instruction in instructions] + messages = [self._create_user_message(content=prompt) for prompt in prompts] + return self.llm.chat(messages) + + def _create_user_message(self, content: str) -> MessageType: + return [{"role": "user", "content": content}] diff --git a/src/panza3/writer/__init__.py b/src/panza3/writer/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/panza3/writer/base.py b/src/panza3/writer/base.py deleted file mode 100644 index 5f767df..0000000 --- a/src/panza3/writer/base.py +++ /dev/null @@ -1,15 +0,0 @@ -from abc import ABC, abstractmethod - -from ..entities import Instruction -from ..llm import LLM -from ..prompting import PromptBuilder - - -class PanzaWriter(ABC): - def __init__(self, prompt_builder: PromptBuilder, llm: LLM): - self.prompt_builder = prompt_builder - self.llm = llm - - @abstractmethod - def run(self, instruction: Instruction) -> str: - pass diff --git a/tests/test_writer.py b/tests/test_writer.py new file mode 100644 index 0000000..1f0edde --- /dev/null +++ b/tests/test_writer.py @@ -0,0 +1,27 @@ +from unittest.mock import MagicMock + +import pytest + +from panza3.entities import EmailInstruction +from panza3.llm import LLM +from panza3.prompting import EmailPromptBuilder +from panza3.writer import PanzaWriter + + +def test_email_writer_init(): + # Create mock prompt builder + mock_builder = MagicMock(spec=EmailPromptBuilder) + mock_builder.build_prompt.side_effect = ( + lambda instruction: f"Instruction: {instruction.instruction}" + ) + + # Create mock LLM + mock_llm = MagicMock(spec=LLM) + mock_llm.chat.side_effect = lambda messages: [f"Received: {messages[0]['content']}"] + + panza_writer = PanzaWriter(mock_builder, mock_llm) + + instruction = EmailInstruction(instruction="Write an email.") + + output = panza_writer.run(instruction) + assert output == "Received: Instruction: Write an email." 
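
A minimal end-to-end sketch (not itself a patch) of how the components introduced up to this point are meant to compose: a FaissRetriever feeds an EmailPromptBuilder, which together with a TransformersLLM is wrapped by PanzaWriter. The paths, index name, checkpoint, sampling parameters, and the "local_llm" label below are placeholder values borrowed from the repository's defaults and tests, and a serialized FAISS index is assumed to already exist under the data directory.

    # Sketch only: placeholder paths/values; assumes a prebuilt FAISS index on disk.
    from panza3.entities import Email, EmailInstruction
    from panza3.llm import TransformersLLM
    from panza3.prompting import EmailPromptBuilder
    from panza3.retriever import FaissRetriever
    from panza3.writer import PanzaWriter

    retriever = FaissRetriever(
        db_path="data",                       # placeholder: directory holding the serialized index
        index_name="firstname.lastname",      # placeholder: the Panza username
        embedding_model="sentence-transformers/all-mpnet-base-v2",
        device="cpu",
        document_class=Email,
    )

    system, user, rag, thread = EmailPromptBuilder.load_all_preambles(
        system_preamble_path="prompt_preambles/system_preamble.txt",
        user_preamble_path="prompt_preambles/user_preamble.txt",
        rag_preamble_path="prompt_preambles/rag_preamble.txt",
        thread_preamble_path="prompt_preambles/thread_preamble.txt",
    )

    prompt_builder = EmailPromptBuilder(
        retriever=retriever,
        system_preamble=system,
        user_preamble=user,
        rag_preamble=rag,
        thread_preamble=thread,
        number_rag_emails=3,
        rag_relevance_threshold=0.2,
        number_thread_emails=3,
    )

    llm = TransformersLLM(
        name="local_llm",                     # placeholder label
        checkpoint="microsoft/Phi-3-mini-4k-instruct",
        device="cpu",
        sampling_parameters={"do_sample": False, "max_new_tokens": 1024},
        dtype="fp32",                         # CPU path requires fp32
        load_in_4bit=False,
    )

    writer = PanzaWriter(prompt_builder=prompt_builder, llm=llm)
    output = writer.run(EmailInstruction(instruction="Write an email."))
    print(output)
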
From c648f98e64471f4e7c1ccae1680276a37f3f8810 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Tue, 27 Aug 2024 11:57:58 +0200 Subject: [PATCH 017/112] Add support to return full prompt from writer --- src/panza3/writer.py | 28 ++++++++++++++++++++++------ tests/test_writer.py | 6 +++++- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/panza3/writer.py b/src/panza3/writer.py index 8c6a2bf..885c59e 100644 --- a/src/panza3/writer.py +++ b/src/panza3/writer.py @@ -1,4 +1,4 @@ -from typing import Iterator, List +from typing import Iterator, List, Tuple from .entities import Instruction from .llm import LLM, MessageType @@ -11,18 +11,34 @@ def __init__(self, prompt_builder: PromptBuilder, llm: LLM): self.prompt_builder = prompt_builder self.llm = llm - def run(self, instruction: Instruction, stream: bool = False) -> str | Iterator[str]: + def run( + self, instruction: Instruction, stream: bool = False, return_prompt: bool = False + ) -> str | Iterator[str] | Tuple[str, str] | Tuple[Iterator[str], str]: prompt = self.prompt_builder.build_prompt(instruction) messages = self._create_user_message(content=prompt) + if stream: - return self.llm.chat_stream(messages) + response = self.llm.chat_stream(messages) + else: + response = self.llm.chat(messages)[0] + + if return_prompt: + return response, prompt else: - return self.llm.chat(messages)[0] + return response - def run_batch(self, instructions: List[Instruction]) -> List[str]: + def run_batch( + self, instructions: List[Instruction], return_prompt: bool = False + ) -> List[str] | Tuple[List[str], List[str]]: prompts = [self.prompt_builder.build_prompt(instruction) for instruction in instructions] messages = [self._create_user_message(content=prompt) for prompt in prompts] - return self.llm.chat(messages) + + response = self.llm.chat(messages) + + if return_prompt: + return response, prompts + else: + return response def _create_user_message(self, content: str) -> MessageType: return [{"role": "user", "content": content}] diff --git a/tests/test_writer.py b/tests/test_writer.py index 1f0edde..48e9f5d 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -8,7 +8,7 @@ from panza3.writer import PanzaWriter -def test_email_writer_init(): +def test_email_writer(): # Create mock prompt builder mock_builder = MagicMock(spec=EmailPromptBuilder) mock_builder.build_prompt.side_effect = ( @@ -25,3 +25,7 @@ def test_email_writer_init(): output = panza_writer.run(instruction) assert output == "Received: Instruction: Write an email." + + output, prompt = panza_writer.run(instruction, return_prompt=True) + assert output == "Received: Instruction: Write an email." + assert prompt == "Instruction: Write an email." 
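
Building on the sketch above (again illustrative, not a patch), the new return_prompt flag exposes the fully rendered prompt alongside the response, for both single and batched instructions; the instruction strings are placeholders.

    # Sketch only: reuses `writer` from the previous sketch.
    instruction = EmailInstruction(instruction="Write an email.")

    # Single instruction: also get back the prompt assembled by EmailPromptBuilder.
    output, prompt = writer.run(instruction, return_prompt=True)

    # Batched instructions: responses and prompts come back as parallel lists.
    outputs, prompts = writer.run_batch(
        [
            EmailInstruction(instruction="Write a follow-up."),
            EmailInstruction(instruction="Decline the invitation."),
        ],
        return_prompt=True,
    )
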
From 09f08e42b4cd8fb1f31891ddd7a9fba4c205d85e Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Tue, 27 Aug 2024 12:08:41 +0200 Subject: [PATCH 018/112] Set corresponding retriever document type in prompt builder --- src/panza3/prompting/email_prompting.py | 2 ++ src/panza3/retriever/base.py | 3 +++ src/panza3/retriever/faiss.py | 2 +- tests/test_retriever.py | 5 +++-- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/panza3/prompting/email_prompting.py b/src/panza3/prompting/email_prompting.py index 8ca0390..ebfc590 100644 --- a/src/panza3/prompting/email_prompting.py +++ b/src/panza3/prompting/email_prompting.py @@ -27,6 +27,8 @@ def __init__( self.rag_relevance_threshold = rag_relevance_threshold self.number_thread_emails = number_thread_emails + self.retriever.set_document_class(Email) + def _create_rag_preamble_from_emails(self, emails: List[Email]) -> str: rag_context = self._create_rag_context_from_emails(emails) return self.rag_preamble.format(rag_context=rag_context) diff --git a/src/panza3/retriever/base.py b/src/panza3/retriever/base.py index 05bc775..d0e3e37 100644 --- a/src/panza3/retriever/base.py +++ b/src/panza3/retriever/base.py @@ -22,3 +22,6 @@ def store(self, documents: List[Document]): @abstractmethod def save_db_to_disk(self): pass + + def set_document_class(self, document_class: type[Document]): + self.document_class = document_class diff --git a/src/panza3/retriever/faiss.py b/src/panza3/retriever/faiss.py index 7c4e567..466270b 100644 --- a/src/panza3/retriever/faiss.py +++ b/src/panza3/retriever/faiss.py @@ -19,7 +19,7 @@ def __init__( index_name: str, embedding_model: str, device: str, - document_class: type[Document], + document_class: Optional[type[Document]] = None, ) -> None: self.db_path = db_path diff --git a/tests/test_retriever.py b/tests/test_retriever.py index 8d7af0a..347df94 100644 --- a/tests/test_retriever.py +++ b/tests/test_retriever.py @@ -10,13 +10,14 @@ def get_faiss_retriever( db_path: Path, index_name: str, embedding_model: str, device: str ) -> FaissRetriever: - return FaissRetriever( + retriever = FaissRetriever( db_path=db_path, index_name=index_name, embedding_model=embedding_model, device=device, - document_class=Email, ) + retriever.set_document_class(Email) + return retriever def test_faiss_retriever_init_empty(tmp_path: Path, index_name: str, embedding_model: str): From a3c6ac6535aa663de19af9b043cb6172b09ee063 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Tue, 27 Aug 2024 12:10:14 +0200 Subject: [PATCH 019/112] Remove debugging print from prompting utils --- src/panza3/prompting/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/panza3/prompting/utils.py b/src/panza3/prompting/utils.py index f04f32b..ec3daee 100644 --- a/src/panza3/prompting/utils.py +++ b/src/panza3/prompting/utils.py @@ -9,7 +9,6 @@ def load_user_preamble(path: str) -> str: # We issue a warning to the user. 
with open(path, "r") as file: lines = [l for l in file.readlines() if not l.strip().startswith("#")] - print(lines) preamble = "".join(lines) if "CHANGE ME" in preamble: print( From 6b671564c8ed6505a528ba32aebfd374b9e1d17c Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Tue, 27 Aug 2024 12:58:35 +0200 Subject: [PATCH 020/112] Add Hydra config-based runner for Panza writer --- configs/base.yaml | 2 + configs/panza_writer.yaml | 7 +++ configs/user/default.yaml | 9 ++++ configs/writer/email.yaml | 5 ++ configs/writer/llm/ollama.yaml | 1 + configs/writer/llm/peft.yaml | 1 + configs/writer/llm/sampling/greedy.yaml | 2 + configs/writer/llm/sampling/random.yaml | 5 ++ configs/writer/llm/transformers.yaml | 9 ++++ configs/writer/prompting/email_prompting.yaml | 13 +++++ configs/writer/prompting/retriever/faiss.yaml | 5 ++ scripts/runner.py | 48 +++++++++++++++++++ src/panza3/__init__.py | 10 ++++ 13 files changed, 117 insertions(+) create mode 100644 configs/base.yaml create mode 100644 configs/panza_writer.yaml create mode 100644 configs/user/default.yaml create mode 100644 configs/writer/email.yaml create mode 100644 configs/writer/llm/ollama.yaml create mode 100644 configs/writer/llm/peft.yaml create mode 100644 configs/writer/llm/sampling/greedy.yaml create mode 100644 configs/writer/llm/sampling/random.yaml create mode 100644 configs/writer/llm/transformers.yaml create mode 100644 configs/writer/prompting/email_prompting.yaml create mode 100644 configs/writer/prompting/retriever/faiss.yaml create mode 100644 scripts/runner.py diff --git a/configs/base.yaml b/configs/base.yaml new file mode 100644 index 0000000..2d5ba57 --- /dev/null +++ b/configs/base.yaml @@ -0,0 +1,2 @@ +panza_workspace: ${hydra:runtime.cwd}/../ +seed: 42 diff --git a/configs/panza_writer.yaml b/configs/panza_writer.yaml new file mode 100644 index 0000000..5de1f0c --- /dev/null +++ b/configs/panza_writer.yaml @@ -0,0 +1,7 @@ +defaults: + - base + - writer: email + - user: default + +embedding_model: "sentence-transformers/all-mpnet-base-v2" +checkpoint: "microsoft/Phi-3-mini-4k-instruct" diff --git a/configs/user/default.yaml b/configs/user/default.yaml new file mode 100644 index 0000000..853692c --- /dev/null +++ b/configs/user/default.yaml @@ -0,0 +1,9 @@ +email_address: "firstname.lastname@gmail.com" # Change this to your email address! +username: "firstname.lastname" # TODO(armand): Use custom resolver to extract username from email address. 
+ +data_dir: ${panza_workspace}/data + +system_preamble_path: ${panza_workspace}/prompt_preambles/system_preamble.txt +user_preamble_path: ${panza_workspace}/prompt_preambles/user_preamble.txt +rag_preamble_path: ${panza_workspace}/prompt_preambles/rag_preamble.txt +thread_preamble_path: ${panza_workspace}/prompt_preambles/thread_preamble.txt \ No newline at end of file diff --git a/configs/writer/email.yaml b/configs/writer/email.yaml new file mode 100644 index 0000000..3b816d4 --- /dev/null +++ b/configs/writer/email.yaml @@ -0,0 +1,5 @@ +defaults: + - llm: transformers + - prompting: email_prompting + +_target_: panza3.writer.PanzaWriter diff --git a/configs/writer/llm/ollama.yaml b/configs/writer/llm/ollama.yaml new file mode 100644 index 0000000..1afe70a --- /dev/null +++ b/configs/writer/llm/ollama.yaml @@ -0,0 +1 @@ +# TODO: Add Ollama config \ No newline at end of file diff --git a/configs/writer/llm/peft.yaml b/configs/writer/llm/peft.yaml new file mode 100644 index 0000000..6933876 --- /dev/null +++ b/configs/writer/llm/peft.yaml @@ -0,0 +1 @@ +# TODO: Add PEFT config \ No newline at end of file diff --git a/configs/writer/llm/sampling/greedy.yaml b/configs/writer/llm/sampling/greedy.yaml new file mode 100644 index 0000000..5854169 --- /dev/null +++ b/configs/writer/llm/sampling/greedy.yaml @@ -0,0 +1,2 @@ +do_sample: false +max_new_tokens: 1024 \ No newline at end of file diff --git a/configs/writer/llm/sampling/random.yaml b/configs/writer/llm/sampling/random.yaml new file mode 100644 index 0000000..e5a954a --- /dev/null +++ b/configs/writer/llm/sampling/random.yaml @@ -0,0 +1,5 @@ +do_sample: true +temperature: 0.7 +top_k: 50 +top_p: 0.7 +max_new_tokens: 1024 \ No newline at end of file diff --git a/configs/writer/llm/transformers.yaml b/configs/writer/llm/transformers.yaml new file mode 100644 index 0000000..f872cf0 --- /dev/null +++ b/configs/writer/llm/transformers.yaml @@ -0,0 +1,9 @@ +defaults: + - sampling: random + +_target_: panza3.llm.TransformersLLM +name: ${checkpoint} +checkpoint: ${checkpoint} +device: "cpu" +dtype: "fp32" +load_in_4bit: false \ No newline at end of file diff --git a/configs/writer/prompting/email_prompting.yaml b/configs/writer/prompting/email_prompting.yaml new file mode 100644 index 0000000..6dc5631 --- /dev/null +++ b/configs/writer/prompting/email_prompting.yaml @@ -0,0 +1,13 @@ +defaults: + - retriever: faiss + +_target_: panza3.prompting.EmailPromptBuilder + +system_preamble: ${load_preamble:${user.system_preamble_path}} +user_preamble: ${load_user_preamble:${user.user_preamble_path}} +rag_preamble: ${load_preamble:${user.rag_preamble_path}} +thread_preamble: ${load_preamble:${user.thread_preamble_path}} + +number_rag_emails: 3 +rag_relevance_threshold: 0.2 +number_thread_emails: 3 \ No newline at end of file diff --git a/configs/writer/prompting/retriever/faiss.yaml b/configs/writer/prompting/retriever/faiss.yaml new file mode 100644 index 0000000..9a00354 --- /dev/null +++ b/configs/writer/prompting/retriever/faiss.yaml @@ -0,0 +1,5 @@ +_target_: panza3.retriever.FaissRetriever +db_path: ${user.data_dir} +index_name: ${user.username} +embedding_model: ${embedding_model} +device: "cpu" \ No newline at end of file diff --git a/scripts/runner.py b/scripts/runner.py new file mode 100644 index 0000000..2254b43 --- /dev/null +++ b/scripts/runner.py @@ -0,0 +1,48 @@ +import logging + +import hydra +from omegaconf import DictConfig, OmegaConf + +from panza3 import PanzaWriter # The import also loads custom Hydra resolvers +from panza3.entities 
import EmailInstruction + +LOGGER = logging.getLogger(__name__) + + +def rename_config_keys(cfg: DictConfig) -> None: + # Disable struct mode to allow modifications + OmegaConf.set_struct(cfg, False) + + cfg.writer.llm.sampling_parameters = cfg.writer.llm.sampling + del cfg.writer.llm.sampling + + cfg.writer.prompt_builder = cfg.writer.prompting + del cfg.writer.prompting + + # Re-enable struct mode to lock down the configuration + OmegaConf.set_struct(cfg, True) + + +@hydra.main(version_base="1.1", config_path="../configs", config_name="panza_writer") +def main(cfg: DictConfig) -> None: + # print(OmegaConf.to_yaml(cfg, resolve=True)) + LOGGER.info("Starting Panza Writer") + LOGGER.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) + + # Rename config keys to follow class structure + rename_config_keys(cfg) + + # Instantiate Panza writer + writer: PanzaWriter = hydra.utils.instantiate(cfg.writer) + assert isinstance(writer, PanzaWriter), "Failed to instantiate PanzaWriter" + + # TODO: Connect to CLI / GUI / webserver, etc. + output, prompt = writer.run( + instruction=EmailInstruction(instruction="Write an email."), return_prompt=True + ) + print("Prompt:", prompt) + print("Output:", output) + + +if __name__ == "__main__": + main() diff --git a/src/panza3/__init__.py b/src/panza3/__init__.py index e69de29..100281c 100644 --- a/src/panza3/__init__.py +++ b/src/panza3/__init__.py @@ -0,0 +1,10 @@ +from omegaconf import OmegaConf + +from .prompting.utils import load_preamble, load_user_preamble + +OmegaConf.register_new_resolver("load_preamble", load_preamble) +OmegaConf.register_new_resolver("load_user_preamble", load_user_preamble) + +from .writer import PanzaWriter + +__all__ = ["PanzaWriter"] From cd9b041d9587fdbecd86aa34eee27f0d7d0c83f2 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:02:27 +0200 Subject: [PATCH 021/112] rename ollama_llm.py to ollama.py --- src/panza3/llm/__init__.py | 2 +- src/panza3/llm/{ollama_llm.py => ollama.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/panza3/llm/{ollama_llm.py => ollama.py} (100%) diff --git a/src/panza3/llm/__init__.py b/src/panza3/llm/__init__.py index 20e4146..acca5fd 100644 --- a/src/panza3/llm/__init__.py +++ b/src/panza3/llm/__init__.py @@ -1,5 +1,5 @@ from .base import LLM, ChatHistoryType, MessageType from .local import PeftLLM, TransformersLLM -from .ollama_llm import OllamaLLM +from .ollama import OllamaLLM __all__ = ["LLM", "ChatHistoryType", "MessageType", "OllamaLLM", "TransformersLLM", "PeftLLM"] diff --git a/src/panza3/llm/ollama_llm.py b/src/panza3/llm/ollama.py similarity index 100% rename from src/panza3/llm/ollama_llm.py rename to src/panza3/llm/ollama.py From 7e3ee433393fa7b1d24d79e53d6df564041002e6 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:07:42 +0200 Subject: [PATCH 022/112] add type annotations to OllamaLLM --- src/panza3/llm/ollama.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/panza3/llm/ollama.py b/src/panza3/llm/ollama.py index 9676b2a..047c952 100644 --- a/src/panza3/llm/ollama.py +++ b/src/panza3/llm/ollama.py @@ -1,10 +1,11 @@ import os +from typing import Dict, Iterator, List import ollama -from .base import LLM +from .base import LLM, ChatHistoryType class OllamaLLM(LLM): - def __init__(self, name: str, gguf_file: str, sampling_params: dict): + def __init__(self, name: str, gguf_file: str, 
sampling_params: Dict): """ Loads and serves the model from the GGUF file into Ollama with the given name and sampling parameters. """ @@ -50,14 +51,14 @@ def _load_model(self): except: raise Exception(f"Failed to load model {self.name} with GGUF file {self.gguf_file}.") - def _get_message(self, response): + def _get_message(self, response) -> str: return response['message']['content'] - def chat(self, messages): + def chat(self, messages: ChatHistoryType | List[ChatHistoryType]) -> List[str]: response = ollama.chat(model=self.name, messages=messages, stream=False) - return self._get_message(response) + return [self._get_message(response)] - def chat_stream(self, messages): + def chat_stream(self, messages: ChatHistoryType) -> Iterator[str]: stream = ollama.chat( model=self.name, messages=messages, From 8b1ef60708131b29b4335df5d31fc8035f90e8b4 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:09:48 +0200 Subject: [PATCH 023/112] add some more type annotations to OllamaLLM --- src/panza3/llm/ollama.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/panza3/llm/ollama.py b/src/panza3/llm/ollama.py index 047c952..f7191f5 100644 --- a/src/panza3/llm/ollama.py +++ b/src/panza3/llm/ollama.py @@ -19,21 +19,21 @@ def __init__(self, name: str, gguf_file: str, sampling_params: Dict): if not self._is_model_loaded(): self._load_model() - def _is_ollama_running(self): + def _is_ollama_running(self) -> bool: try: ollama.list() return True except: return False - def _start_ollama(self): + def _start_ollama(self) -> None: # run the bash command "ollama list" which causes Ollama to start if it is not already running try: os.system("/bin/bash -c 'ollama list'") except: raise Exception("Ollama failed to start.") - def _is_model_loaded(self): + def _is_model_loaded(self) -> bool: for model in ollama.list()['models']: # model name is everything before the colon name = model['name'].split(":")[0] @@ -41,7 +41,7 @@ def _is_model_loaded(self): return True return False - def _load_model(self): + def _load_model(self) -> None: # TODO: Add sampling parameters to the model file modelfile = f""" FROM {self.gguf_file} From 73b37befda92760aa87d02a9d1567363fc71d498 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:11:08 +0200 Subject: [PATCH 024/112] check installation for OllamaLLM --- src/panza3/llm/ollama.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/panza3/llm/ollama.py b/src/panza3/llm/ollama.py index f7191f5..610ff93 100644 --- a/src/panza3/llm/ollama.py +++ b/src/panza3/llm/ollama.py @@ -1,9 +1,15 @@ import os from typing import Dict, Iterator, List -import ollama - from .base import LLM, ChatHistoryType +_MISSING_LIBRARIES = [] + +try: + import ollama +except ImportError: + ollama = None + _MISSING_LIBRARIES.append("ollama") + class OllamaLLM(LLM): def __init__(self, name: str, gguf_file: str, sampling_params: Dict): """ @@ -53,6 +59,10 @@ def _load_model(self) -> None: def _get_message(self, response) -> str: return response['message']['content'] + + def _check_installation(self) -> None: + if ollama is None: + raise ImportError("The 'ollama' library is not installed. 
Please install it with 'pip install ollama'.") def chat(self, messages: ChatHistoryType | List[ChatHistoryType]) -> List[str]: response = ollama.chat(model=self.name, messages=messages, stream=False) From 90d59d15c92f5e3871b90ccd6a0616d6e15913ac Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:12:01 +0200 Subject: [PATCH 025/112] rename test_llm.py to test_local_llm.py --- tests/{test_llm.py => test_local_llm.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_llm.py => test_local_llm.py} (100%) diff --git a/tests/test_llm.py b/tests/test_local_llm.py similarity index 100% rename from tests/test_llm.py rename to tests/test_local_llm.py From f9ddb8f6077193bbd341bde9ecb1706d9a2d7f2e Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:27:39 +0200 Subject: [PATCH 026/112] add pytest to dev dependencies --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5f50131..89f039d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,4 +34,7 @@ where = ["src"] line-length = 100 [tool.pytest.ini_options] -pythonpath = ["src"] \ No newline at end of file +pythonpath = ["src"] + +[dev-dependencies] +pytest = "*" \ No newline at end of file From af88a2811631567b1e924d5d395c7454302e2cf2 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:29:49 +0200 Subject: [PATCH 027/112] add sampling_params to super() init call --- src/panza3/llm/ollama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/panza3/llm/ollama.py b/src/panza3/llm/ollama.py index 610ff93..0cf173a 100644 --- a/src/panza3/llm/ollama.py +++ b/src/panza3/llm/ollama.py @@ -15,7 +15,7 @@ def __init__(self, name: str, gguf_file: str, sampling_params: Dict): """ Loads and serves the model from the GGUF file into Ollama with the given name and sampling parameters. """ - super().__init__(name) + super().__init__(name, sampling_params) self.gguf_file = gguf_file self.sampling_params = sampling_params From 28dce1797a43a2169a9ac328525481eef88c41d8 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:14:41 +0200 Subject: [PATCH 028/112] add unit tests for ollama_llm.py --- tests/test_ollama_llm.py | 70 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tests/test_ollama_llm.py diff --git a/tests/test_ollama_llm.py b/tests/test_ollama_llm.py new file mode 100644 index 0000000..caf3106 --- /dev/null +++ b/tests/test_ollama_llm.py @@ -0,0 +1,70 @@ +from typing import Type +from unittest.mock import patch, MagicMock + +from panza3.llm.ollama import OllamaLLM +import pytest + +MODEL = "test_model" +GGUF_FILE = "test.gguf" +SAMPLING_PARAMS = {"param1": "val1"} +REQUEST = "write an email" +RESPONSE = "here is an email" +RESPONSE_OBJ = { + 'message': { + 'content': RESPONSE + } +} + +@patch('os.system') +@patch('ollama.list') +def test_ollama_llm_init_launches_ollama(ollama_list: MagicMock, os_system: MagicMock): + # When Ollama isn't running, the __init__() should start it by calling os.system(). To simulate Ollama not running, we'll mock the ollama.list() method to raise an exception. 
+ ollama_list.side_effect = Exception("Ollama not running") + try: + OllamaLLM("test", "test.gguf", {}) + except: + pass + os_system.assert_called_once_with("/bin/bash -c 'ollama list'") + +@patch('ollama.create') +@patch('ollama.list') +def test_ollama_llm_init_creates_model(ollama_list: MagicMock, ollama_create: MagicMock): + # When the given module isn't loaded into Ollama yet, the __init__() should load it by calling ollama.create(). To simulate the module not being loaded, we'll mock the ollama.list() method to return an empty list. + ollama_list.return_value = {'models': []} + OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) + ollama_create.assert_called_once() + +# Mock all external calls to prevent side effects +@patch('os.system') +@patch('ollama.list') +@patch('ollama.create') +def test_ollama_llm_init(*args): + # Make sure __init__() sets all local variables correctly + ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) + assert ollama_llm.gguf_file == GGUF_FILE + assert ollama_llm.sampling_params == SAMPLING_PARAMS + assert ollama_llm.name == MODEL + +# Mock all external calls to prevent side effects +@patch('os.system') +@patch('ollama.list') +@patch('ollama.create') +@patch('ollama.chat') +def test_ollama_llm_chat(ollama_chat: MagicMock, *args): + ollama_chat.return_value = RESPONSE_OBJ + ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) + assert ollama_llm.chat(REQUEST) == [RESPONSE] + ollama_chat.assert_called_once() + +# Mock all external calls to prevent side effects +@patch('os.system') +@patch('ollama.list') +@patch('ollama.create') +@patch('ollama.chat') +def test_ollama_llm_chat_stream(ollama_chat: MagicMock, *args): + expected_iterator = iter([RESPONSE_OBJ]) + ollama_chat.return_value = expected_iterator + ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) + # make sure that ollama_llm.chat() returns a generator that yields the expected response + assert list(ollama_llm.chat_stream(REQUEST)) == [RESPONSE] + ollama_chat.assert_called_once_with(model=MODEL, messages=REQUEST, stream=True) \ No newline at end of file From b8fba39bbe1bd70ad8ab6f2629551bcc922985b4 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:30:14 +0200 Subject: [PATCH 029/112] black formatting --- tests/test_ollama_llm.py | 93 ++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/tests/test_ollama_llm.py b/tests/test_ollama_llm.py index caf3106..86fd1b1 100644 --- a/tests/test_ollama_llm.py +++ b/tests/test_ollama_llm.py @@ -9,62 +9,63 @@ SAMPLING_PARAMS = {"param1": "val1"} REQUEST = "write an email" RESPONSE = "here is an email" -RESPONSE_OBJ = { - 'message': { - 'content': RESPONSE - } -} +RESPONSE_OBJ = {"message": {"content": RESPONSE}} -@patch('os.system') -@patch('ollama.list') + +@patch("os.system") +@patch("ollama.list") def test_ollama_llm_init_launches_ollama(ollama_list: MagicMock, os_system: MagicMock): - # When Ollama isn't running, the __init__() should start it by calling os.system(). To simulate Ollama not running, we'll mock the ollama.list() method to raise an exception. - ollama_list.side_effect = Exception("Ollama not running") - try: - OllamaLLM("test", "test.gguf", {}) - except: - pass - os_system.assert_called_once_with("/bin/bash -c 'ollama list'") + # When Ollama isn't running, the __init__() should start it by calling os.system(). To simulate Ollama not running, we'll mock the ollama.list() method to raise an exception. 
+ ollama_list.side_effect = Exception("Ollama not running") + try: + OllamaLLM("test", "test.gguf", {}) + except: + pass + os_system.assert_called_once_with("/bin/bash -c 'ollama list'") + -@patch('ollama.create') -@patch('ollama.list') +@patch("ollama.create") +@patch("ollama.list") def test_ollama_llm_init_creates_model(ollama_list: MagicMock, ollama_create: MagicMock): - # When the given module isn't loaded into Ollama yet, the __init__() should load it by calling ollama.create(). To simulate the module not being loaded, we'll mock the ollama.list() method to return an empty list. - ollama_list.return_value = {'models': []} - OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) - ollama_create.assert_called_once() + # When the given module isn't loaded into Ollama yet, the __init__() should load it by calling ollama.create(). To simulate the module not being loaded, we'll mock the ollama.list() method to return an empty list. + ollama_list.return_value = {"models": []} + OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) + ollama_create.assert_called_once() + # Mock all external calls to prevent side effects -@patch('os.system') -@patch('ollama.list') -@patch('ollama.create') +@patch("os.system") +@patch("ollama.list") +@patch("ollama.create") def test_ollama_llm_init(*args): - # Make sure __init__() sets all local variables correctly - ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) - assert ollama_llm.gguf_file == GGUF_FILE - assert ollama_llm.sampling_params == SAMPLING_PARAMS - assert ollama_llm.name == MODEL + # Make sure __init__() sets all local variables correctly + ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) + assert ollama_llm.gguf_file == GGUF_FILE + assert ollama_llm.sampling_params == SAMPLING_PARAMS + assert ollama_llm.name == MODEL + # Mock all external calls to prevent side effects -@patch('os.system') -@patch('ollama.list') -@patch('ollama.create') -@patch('ollama.chat') +@patch("os.system") +@patch("ollama.list") +@patch("ollama.create") +@patch("ollama.chat") def test_ollama_llm_chat(ollama_chat: MagicMock, *args): - ollama_chat.return_value = RESPONSE_OBJ - ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) - assert ollama_llm.chat(REQUEST) == [RESPONSE] - ollama_chat.assert_called_once() + ollama_chat.return_value = RESPONSE_OBJ + ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) + assert ollama_llm.chat(REQUEST) == [RESPONSE] + ollama_chat.assert_called_once() + # Mock all external calls to prevent side effects -@patch('os.system') -@patch('ollama.list') -@patch('ollama.create') -@patch('ollama.chat') +@patch("os.system") +@patch("ollama.list") +@patch("ollama.create") +@patch("ollama.chat") def test_ollama_llm_chat_stream(ollama_chat: MagicMock, *args): - expected_iterator = iter([RESPONSE_OBJ]) - ollama_chat.return_value = expected_iterator - ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) - # make sure that ollama_llm.chat() returns a generator that yields the expected response - assert list(ollama_llm.chat_stream(REQUEST)) == [RESPONSE] - ollama_chat.assert_called_once_with(model=MODEL, messages=REQUEST, stream=True) \ No newline at end of file + expected_iterator = iter([RESPONSE_OBJ]) + ollama_chat.return_value = expected_iterator + ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) + # make sure that ollama_llm.chat() returns a generator that yields the expected response + assert list(ollama_llm.chat_stream(REQUEST)) == [RESPONSE] + ollama_chat.assert_called_once_with(model=MODEL, messages=REQUEST, stream=True) From 
5f05228af4d553238f74b13d09f1eaf219b6fb69 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:34:46 +0200 Subject: [PATCH 030/112] fix types --- tests/test_ollama_llm.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_ollama_llm.py b/tests/test_ollama_llm.py index 86fd1b1..41d1a4c 100644 --- a/tests/test_ollama_llm.py +++ b/tests/test_ollama_llm.py @@ -1,14 +1,15 @@ -from typing import Type +from typing import Dict, Type from unittest.mock import patch, MagicMock from panza3.llm.ollama import OllamaLLM +from panza3.llm import MessageType import pytest -MODEL = "test_model" -GGUF_FILE = "test.gguf" -SAMPLING_PARAMS = {"param1": "val1"} -REQUEST = "write an email" -RESPONSE = "here is an email" +MODEL: str = "test_model" +GGUF_FILE: str = "test.gguf" +SAMPLING_PARAMS: Dict = {"param1": "val1"} +REQUEST: MessageType = {"content": "write an email"} +RESPONSE: str = "here is an email" RESPONSE_OBJ = {"message": {"content": RESPONSE}} From 91ef88250769351eb36ed313cec3480bc5d958e7 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:36:33 +0200 Subject: [PATCH 031/112] add to gitignore --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8d970b7..317816a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,7 @@ checkpoints/ results/ wandb/ -*.log \ No newline at end of file +*.log +*.egg-info +.vscode +build/ \ No newline at end of file From d4d0a951046b149ffebacec766b410dee924d7cc Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:36:40 +0200 Subject: [PATCH 032/112] black formatting --- src/panza3/hosting/web_service.py | 20 +++++++++++--------- src/panza3/llm/ollama.py | 21 ++++++++++++--------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/panza3/hosting/web_service.py b/src/panza3/hosting/web_service.py index e41c48a..1893bd2 100644 --- a/src/panza3/hosting/web_service.py +++ b/src/panza3/hosting/web_service.py @@ -9,9 +9,11 @@ from dotenv import load_dotenv import threading + class Request(BaseModel): text: str + class PanzaWebService: DEFAULT_PORT = 5001 @@ -35,11 +37,11 @@ def _add_cors(self): def _get_valid_api_keys(self) -> List[str]: return os.getenv("API_KEYS").split(",") - + def _streamer(self, stream): - for chunk in stream: - yield chunk["message"]["content"] - + for chunk in stream: + yield chunk["message"]["content"] + def _predict(self, input: str) -> Generator: # TODO: Call PanzaWriter here # Dummy generator @@ -47,16 +49,16 @@ def _predict(self, input: str) -> Generator: yield {"message": {"content": f"Generated text {i}"}} def _setup_routes(self): - @self.app.options('/generate') + @self.app.options("/generate") def options(): return {"methods": ["POST"]} - - @self.app.post('/generate') + + @self.app.post("/generate") def generate_text(request: Request, x_api_key: Annotated[str | None, Header()]): if x_api_key not in self.api_keys: raise HTTPException(status_code=401, detail="Invalid API key.") stream = self._predict(request.text) - return StreamingResponse(self._streamer(stream), media_type='text/event-stream') + return StreamingResponse(self._streamer(stream), media_type="text/event-stream") def _start_server(self): - uvicorn.run(self.app, port=self.port) \ No newline at end of file + uvicorn.run(self.app, port=self.port) diff --git a/src/panza3/llm/ollama.py 
b/src/panza3/llm/ollama.py index 0cf173a..75806a9 100644 --- a/src/panza3/llm/ollama.py +++ b/src/panza3/llm/ollama.py @@ -10,6 +10,7 @@ ollama = None _MISSING_LIBRARIES.append("ollama") + class OllamaLLM(LLM): def __init__(self, name: str, gguf_file: str, sampling_params: Dict): """ @@ -31,7 +32,7 @@ def _is_ollama_running(self) -> bool: return True except: return False - + def _start_ollama(self) -> None: # run the bash command "ollama list" which causes Ollama to start if it is not already running try: @@ -40,13 +41,13 @@ def _start_ollama(self) -> None: raise Exception("Ollama failed to start.") def _is_model_loaded(self) -> bool: - for model in ollama.list()['models']: + for model in ollama.list()["models"]: # model name is everything before the colon - name = model['name'].split(":")[0] + name = model["name"].split(":")[0] if name == self.name: return True return False - + def _load_model(self) -> None: # TODO: Add sampling parameters to the model file modelfile = f""" @@ -56,13 +57,15 @@ def _load_model(self) -> None: ollama.create(model={self.name}, modelfile=modelfile, stream=True) except: raise Exception(f"Failed to load model {self.name} with GGUF file {self.gguf_file}.") - + def _get_message(self, response) -> str: - return response['message']['content'] - + return response["message"]["content"] + def _check_installation(self) -> None: if ollama is None: - raise ImportError("The 'ollama' library is not installed. Please install it with 'pip install ollama'.") + raise ImportError( + "The 'ollama' library is not installed. Please install it with 'pip install ollama'." + ) def chat(self, messages: ChatHistoryType | List[ChatHistoryType]) -> List[str]: response = ollama.chat(model=self.name, messages=messages, stream=False) @@ -76,4 +79,4 @@ def chat_stream(self, messages: ChatHistoryType) -> Iterator[str]: ) # return a new stream that only contains the message content for chunk in stream: - yield self._get_message(chunk) \ No newline at end of file + yield self._get_message(chunk) From a9f915260d74b0cfac58c0bc8c1d06c65bcc009e Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:46:23 +0200 Subject: [PATCH 033/112] add omegaconf to dependencies --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 89f039d..c79d5b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,8 @@ dependencies = [ "ollama", "llm-foundry@git+https://github.com/IST-DASLab/llm-foundry", "peft@git+https://github.com/IST-DASLab/peft-rosa.git@grad_quant_looser_versioning", - "spops-sm-80" + "spops-sm-80", + "omegaconf", ] [build-system] From 49dfb1d769b9d52254cb868e3c114a8f5a0c8bd0 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Sun, 1 Sep 2024 17:59:08 +0200 Subject: [PATCH 034/112] Add FFT runner --- configs/base.yaml | 5 + .../finetuning/base.yaml | 52 ++--- configs/finetuning/full.yaml | 21 ++ configs/finetuning/rosa.yaml | 20 ++ configs/panza_finetuning.yaml | 18 ++ configs/panza_writer.yaml | 3 +- scripts/finetune.py | 197 ++++++++++++++++++ scripts/runner.py | 1 - src/panza/finetuning/configs/fft_panza.yaml | 93 --------- .../configs/mistral_7b_fft_panza.yaml | 107 ---------- .../configs/mistral_7b_rosa_panza.yaml | 113 ---------- .../finetuning/configs/rosa_panza_colab.yaml | 95 --------- src/panza/finetuning/preprocessing.py | 143 ------------- src/panza3/finetuning/preprocessing.py | 51 +++++ src/{panza => panza3}/finetuning/train.py | 70 +++---- 15 files 
changed, 360 insertions(+), 629 deletions(-) rename src/panza/finetuning/configs/rosa_panza.yaml => configs/finetuning/base.yaml (50%) create mode 100644 configs/finetuning/full.yaml create mode 100644 configs/finetuning/rosa.yaml create mode 100644 configs/panza_finetuning.yaml create mode 100644 scripts/finetune.py delete mode 100644 src/panza/finetuning/configs/fft_panza.yaml delete mode 100644 src/panza/finetuning/configs/mistral_7b_fft_panza.yaml delete mode 100644 src/panza/finetuning/configs/mistral_7b_rosa_panza.yaml delete mode 100644 src/panza/finetuning/configs/rosa_panza_colab.yaml delete mode 100644 src/panza/finetuning/preprocessing.py create mode 100644 src/panza3/finetuning/preprocessing.py rename src/{panza => panza3}/finetuning/train.py (97%) diff --git a/configs/base.yaml b/configs/base.yaml index 2d5ba57..0fb1a10 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -1,2 +1,7 @@ panza_workspace: ${hydra:runtime.cwd}/../ +checkpoint_dir: ${panza_workspace}/checkpoints seed: 42 + +embedding_model: "sentence-transformers/all-mpnet-base-v2" + +wandb_disabled: false \ No newline at end of file diff --git a/src/panza/finetuning/configs/rosa_panza.yaml b/configs/finetuning/base.yaml similarity index 50% rename from src/panza/finetuning/configs/rosa_panza.yaml rename to configs/finetuning/base.yaml index f2cd3b7..d62ad00 100644 --- a/src/panza/finetuning/configs/rosa_panza.yaml +++ b/configs/finetuning/base.yaml @@ -1,50 +1,34 @@ max_seq_len: 512 -global_seed: 17 -model_name_or_path: #TODO +global_seed: ${seed} +model_name_or_path: ${model} load_path: # set via bash script to be absolute path to your sparse checkpoint precision: amp_bf16 -hf_save_path: ./checkpoints +hf_save_path: ${checkpoint_dir}/models -max_duration: # TODO +max_duration: ${num_epochs}ep eval_interval: 1 -seed: ${global_seed} +seed: ${seed} -global_train_batch_size: #TODO -device_train_microbatch_size: 16 -device_eval_batch_size: 16 +global_train_batch_size: ${batch_size} +device_train_microbatch_size: 1 +device_eval_batch_size: 1 -run_name: # If left blank, will be read from env var $RUN_NAME +run_name: # If left blank, it will be generated based on configs model: name: hf_causal_lm pretrained: true - pretrained_model_name_or_path: ${model_name_or_path} - max_seq_len: ${max_seq_len} + pretrained_model_name_or_path: ${finetuning.model_name_or_path} + max_seq_len: ${finetuning.max_seq_len} output_hidden_states: true - weight_bias_dtype: #TODO + weight_bias_dtype: ${model_precision} compute_dtype: bf16 -rosa: - lora_r: #TODO - spa_d: #TODO - lora_alpha: 16 - target_modules: 'all-linear' - lora_dropout: 0.05 - impl: auto - spa_store_transpose: true - rosa_dtype: bf16 - spa_num_grads: 1 - grad_acc_mode: mean_squared - mask_load_path: #TODO - mask_save_path: #TODO - terminate_after_mask_generation: #TODO - schedule: #TODO - tokenizer: - name: ${model_name_or_path} + name: ${finetuning.model_name_or_path} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${finetuning.max_seq_len} train_loader: name: finetuning @@ -52,9 +36,9 @@ train_loader: hf_name: json split: train hf_kwargs: - data_files: #TODO - preprocessing_fn: preprocessing:panza_preprocessing_function - max_seq_len: ${max_seq_len} + data_files: ${user.data_dir}/train.jsonl + preprocessing_fn: panza3.finetuning.preprocessing:panza_preprocessing_function + max_seq_len: ${finetuning.max_seq_len} allow_pad_trimming: false decoder_only_format: true shuffle: true @@ -72,7 +56,7 @@ scheduler: optimizer: name: decoupled_adamw - lr: # TODO + lr: 
${lr} betas: - 0.9 - 0.999 diff --git a/configs/finetuning/full.yaml b/configs/finetuning/full.yaml new file mode 100644 index 0000000..27e90bb --- /dev/null +++ b/configs/finetuning/full.yaml @@ -0,0 +1,21 @@ +defaults: + - base + +fsdp_config: + sharding_strategy: FULL_SHARD + mixed_precision: FULL + activation_checkpointing: true + activation_checkpointing_reentrant: false + activation_cpu_offload: false + limit_all_gathers: true + verbose: false + +callbacks: + hf_checkpointer: + overwrite: true + precision: # TODO + save_folder: ${finetuning.hf_save_path}/${finetuning.run_name} + save_interval: 1dur + +scheduler: + t_warmup: 20ba diff --git a/configs/finetuning/rosa.yaml b/configs/finetuning/rosa.yaml new file mode 100644 index 0000000..20d2b4b --- /dev/null +++ b/configs/finetuning/rosa.yaml @@ -0,0 +1,20 @@ +rosa: + lora_lr: ${lr} + lora_r: 8 + spa_d: #TODO + lora_alpha: 16 + target_modules: 'all-linear' + lora_dropout: 0.05 + impl: auto + spa_store_transpose: true + rosa_dtype: bf16 + spa_num_grads: 1 + grad_acc_mode: mean_squared + mask_load_path: #TODO + mask_save_path: #TODO + terminate_after_mask_generation: #TODO + schedule: #TODO + mask_gen_model_precision: #TODO + +scheduler: + t_warmup: 8ba \ No newline at end of file diff --git a/configs/panza_finetuning.yaml b/configs/panza_finetuning.yaml new file mode 100644 index 0000000..0e05413 --- /dev/null +++ b/configs/panza_finetuning.yaml @@ -0,0 +1,18 @@ +defaults: + - base + - user: default + - finetuning: full + - writer/prompting/email_prompting@preprocessing.prompting + + +model: "ISTA-DASLab/Meta-Llama-3-8B-Instruct" +num_epochs: 1 +lr: 1e-4 +batch_size: 8 +model_precision: bf16 # bf16 or fp32 + +preprocessing: + model: ${model} + prompting: + number_rag_emails: 0 + number_thread_emails: 0 \ No newline at end of file diff --git a/configs/panza_writer.yaml b/configs/panza_writer.yaml index 5de1f0c..13fede3 100644 --- a/configs/panza_writer.yaml +++ b/configs/panza_writer.yaml @@ -3,5 +3,4 @@ defaults: - writer: email - user: default -embedding_model: "sentence-transformers/all-mpnet-base-v2" -checkpoint: "microsoft/Phi-3-mini-4k-instruct" +checkpoint: "microsoft/Phi-3-mini-4k-instruct" \ No newline at end of file diff --git a/scripts/finetune.py b/scripts/finetune.py new file mode 100644 index 0000000..fb6f51a --- /dev/null +++ b/scripts/finetune.py @@ -0,0 +1,197 @@ +import codecs +import logging +import os +import pty +import random +import shutil +import subprocess +import sys +import tempfile +import time +from pathlib import Path + +import hydra +import psutil +import torch +from omegaconf import DictConfig, OmegaConf + +from panza3 import PanzaWriter # The import also loads custom Hydra resolvers + +LOGGER = logging.getLogger(__name__) + + +def create_run_name(cfg: DictConfig) -> str: + # export RUN_NAME=panza_${PANZA_USERNAME}_${MODEL_TYPE}_${MODEL_PRECISION}-bs${BS}-fft-lr${LR}-epochs${NUM_EPOCHS}-wu${WARMUP}-seed${SEED}${PREAMBLE_STR}${RAFT_STR}-$RANDOM + + run_name = f"panza_{cfg.user.username}" + + model_name = cfg.model.split("/")[-1] + run_name += f"-{model_name}" + + run_name += f"-{cfg.model_precision}" + run_name += f"-bs{cfg.batch_size}" + + if hasattr(cfg.finetuning, "rosa"): + run_name += "-rosa" + else: + run_name += "-fft" + + run_name += f"-lr{cfg.lr}" + run_name += f"-epochs{cfg.num_epochs}" + run_name += f"-seed{cfg.seed}" + run_name += f"-{random.randint(1e6, 1e7 - 1)}" + + return run_name + + +def determine_rosa_schedule(cfg: DictConfig) -> str: + pass + + +def create_experiment_yaml() -> str: + 
pass + + +def create_checkpoint_dirs(cfg: DictConfig) -> None: + # Create model directory + os.makedirs(cfg.finetuning.hf_save_path) + + # Create mask directory + if hasattr(cfg.finetuning, "rosa"): + os.makedirs(cfg.finetuning.rosa.mask_save_path) + + +def get_hf_save_precision(cfg: DictConfig) -> str: + if cfg.model_precision == "bf16": + return "bfloat16" + elif cfg.model_precision == "fp32": + return "fp32" + else: + raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") + + +def get_rosa_dtype(cfg: DictConfig) -> str: + if cfg.model_precision == "bf16": + return "bg16" + elif cfg.model_precision == "fp32": + return "fp32" + elif cfg.model_precision == "4bit": + return "fp32" + else: + raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") + + +def override_config(cfg: DictConfig) -> None: + # Disable struct mode to allow modifications + OmegaConf.set_struct(cfg, False) + + cfg.finetuning.run_name = create_run_name(cfg) + + if hasattr(cfg.finetuning, "rosa"): + pass + else: + cfg.finetuning.callbacks.hf_checkpointer.precision = get_hf_save_precision(cfg) + + # Re-enable struct mode to lock down the configuration + OmegaConf.set_struct(cfg, True) + + +def save_config_to_yaml(cfg: DictConfig) -> str: + cfg = OmegaConf.to_container(cfg, resolve=True) + with tempfile.NamedTemporaryFile("w", delete=False, suffix=".yaml") as temp_file: + OmegaConf.save(config=cfg, f=temp_file.name) + return temp_file.name + + +def launch_experiment(cfg: DictConfig, finetuning_yaml: str, prompt_builder_yaml: str) -> None: + def terminate_process_tree(pid: str): + try: + parent = psutil.Process(pid) + children = parent.children(recursive=True) + for child in children: + print("Terminating child process", child) + child.terminate() + psutil.wait_procs(children, timeout=5) + print("Terminating parent process", parent) + parent.terminate() + parent.wait(5) + except psutil.NoSuchProcess: + pass + + def move_checkpoint_files(cfg: DictConfig) -> None: + # Move checkpoint files to the final directory + run_save_path = Path(cfg.hf_save_path) / "models" / cfg.run_name + huggingface_dir = run_save_path / "huggingface" + last_save_dir_name = max(huggingface_dir.iterdir(), key=os.path.getmtime).name + + # Move the contents of the last saved directory to the run save path + source_dir = huggingface_dir / last_save_dir_name + for item in source_dir.iterdir(): + shutil.move(str(item), run_save_path) + + # Remove the now-empty huggingface directory + shutil.rmtree(huggingface_dir) + + train_script = os.path.join(cfg.panza_workspace, "src/panza3/finetuning/train.py") + environment = os.environ.copy() + environment["PYTHONPATH"] = os.path.join(cfg.panza_workspace, "src") + environment["WANDB_PROJECT"] = f"panza-{cfg.user.username}" + environment["WANDB_DISABLED"] = str(int(cfg.wandb_disabled)) + environment["PANZA_PREPROCESSING_CONFIG"] = prompt_builder_yaml + + print(finetuning_yaml) + print(train_script) + print(environment["PYTHONPATH"]) + command = f"composer {train_script} {finetuning_yaml}" + master, slave = pty.openpty() # Open a pseudo-terminal + with subprocess.Popen( + command, + stdout=slave, + stderr=subprocess.STDOUT, + text=True, + env=environment, + preexec_fn=os.setsid, + shell=True, + ) as process: + os.close(slave) # Close the slave descriptor + + # Set up a stream reader for the master end of the pty + try: + with codecs.getreader("utf-8")(os.fdopen(master, "rb")) as reader: + # Read and process output line by line + for line in reader: + print(line, end="") + + return 
process.returncode + except KeyboardInterrupt: + print("Killing process") + # os.killpg(os.getpgid(process.pid), subprocess.signal.SIGTERM) + terminate_process_tree(process.pid) + torch.cuda.empty_cache() + time.sleep(3) # Give some time for GPU resources to be released + + if not hasattr(cfg.finetuning, "rosa"): + move_checkpoint_files(cfg) + + print("Find the finetuned model at", os.path.join(cfg.hf_save_path, "models", cfg.run_name)) + + +@hydra.main(version_base="1.1", config_path="../configs", config_name="panza_finetuning") +def main(cfg: DictConfig) -> None: + LOGGER.info("Starting Panza Finetuning") + LOGGER.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) + + # Override configuration + override_config(cfg) + + # Launch training + if "rosa" in cfg.finetuning: + pass + else: + finetuning_yaml = save_config_to_yaml(cfg.finetuning) + preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) + launch_experiment(cfg, finetuning_yaml, preprocessing_yaml) + + +if __name__ == "__main__": + main() diff --git a/scripts/runner.py b/scripts/runner.py index 2254b43..27ee94e 100644 --- a/scripts/runner.py +++ b/scripts/runner.py @@ -25,7 +25,6 @@ def rename_config_keys(cfg: DictConfig) -> None: @hydra.main(version_base="1.1", config_path="../configs", config_name="panza_writer") def main(cfg: DictConfig) -> None: - # print(OmegaConf.to_yaml(cfg, resolve=True)) LOGGER.info("Starting Panza Writer") LOGGER.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) diff --git a/src/panza/finetuning/configs/fft_panza.yaml b/src/panza/finetuning/configs/fft_panza.yaml deleted file mode 100644 index 7c2e128..0000000 --- a/src/panza/finetuning/configs/fft_panza.yaml +++ /dev/null @@ -1,93 +0,0 @@ -max_seq_len: 512 -global_seed: 17 -model_name_or_path: #TODO - -load_path: # set via bash script to be absolute path to your sparse checkpoint -precision: amp_bf16 -hf_save_path: ./checkpoints - -max_duration: # TODO -eval_interval: 1 -# eval_first: false -seed: ${global_seed} - -global_train_batch_size: #TODO -device_train_microbatch_size: 16 -device_eval_batch_size: 16 - -run_name: # If left blank, will be read from env var $RUN_NAME - -model: - name: hf_causal_lm - pretrained: true - pretrained_model_name_or_path: ${model_name_or_path} - max_seq_len: ${max_seq_len} - output_hidden_states: true - weight_bias_dtype: #TODO - compute_dtype: bf16 - -tokenizer: - name: ${model_name_or_path} - kwargs: - model_max_length: ${max_seq_len} - -train_loader: - name: finetuning - dataset: - hf_name: json - split: train - hf_kwargs: - data_files: #TODO - preprocessing_fn: preprocessing:panza_preprocessing_function - max_seq_len: ${max_seq_len} - allow_pad_trimming: false - decoder_only_format: true - shuffle: true - drop_last: false - num_workers: 8 - pin_memory: false - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - -scheduler: - name: linear_decay_with_warmup - t_warmup: 20ba - alpha_f: 0 - -optimizer: - name: decoupled_adamw - lr: # TODO - betas: - - 0.9 - - 0.999 - eps: 1.0e-8 - weight_decay: 0.0 - -fsdp_config: - sharding_strategy: FULL_SHARD - mixed_precision: FULL - activation_checkpointing: true - activation_checkpointing_reentrant: false - activation_cpu_offload: false - limit_all_gathers: true - verbose: false - -progress_bar: false -log_to_console: true -console_log_interval: 1ba - -callbacks: - speed_monitor: - window_size: 10 - lr_monitor: { } - memory_monitor: { } - runtime_estimator: { } - hf_checkpointer: - overwrite: true - precision: # TODO - save_folder: 
${hf_save_path}/${run_name} - save_interval: 1dur - -loggers: - wandb: { } diff --git a/src/panza/finetuning/configs/mistral_7b_fft_panza.yaml b/src/panza/finetuning/configs/mistral_7b_fft_panza.yaml deleted file mode 100644 index 1874b7e..0000000 --- a/src/panza/finetuning/configs/mistral_7b_fft_panza.yaml +++ /dev/null @@ -1,107 +0,0 @@ -# This config trains lora and spa the whole time, which means it restarts the training after grad collection. - -max_seq_len: 512 -global_seed: 17 -model_name_or_path: #TODO - -load_path: # set via bash script to be absolute path to your sparse checkpoint -precision: amp_bf16 -hf_save_path: ./checkpoints - -max_duration: # TODO -eval_interval: 1 -# eval_first: false -seed: ${global_seed} - -global_train_batch_size: #TODO -# for mpt-7b dense: -# 4 x A100_80GB = "device_train_microbatch_size: 12" -# 8 x A6000_48GB = "device_train_microbatch_size: 6" - -# for mpt-7b sparse (with masks): -# 8 x A6000_48GB = "device_train_microbatch_size: 4" -device_train_microbatch_size: 16 -device_eval_batch_size: 16 - -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME - -model: - name: hf_causal_lm - pretrained: true - pretrained_model_name_or_path: ${model_name_or_path} - max_seq_len: ${max_seq_len} - output_hidden_states: true - weight_bias_dtype: #TODO - compute_dtype: bf16 - # config_overrides: - # attn_config: - # attn_impl: torch - # Set this to `true` if using `train_loader.dataset.packing_ratio` below - # attn_uses_sequence_id: true - -# Tokenizer -tokenizer: - name: ${model_name_or_path} - kwargs: - model_max_length: ${max_seq_len} - -# Dataloaders -train_loader: - name: finetuning - dataset: - hf_name: json - split: train - hf_kwargs: - data_files: #TODO - preprocessing_fn: preprocessing:panza_preprocessing_function - max_seq_len: ${max_seq_len} - allow_pad_trimming: false - decoder_only_format: true - shuffle: true - drop_last: false - num_workers: 8 - pin_memory: false - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - -# Optimization -scheduler: - name: linear_decay_with_warmup - t_warmup: 20ba - alpha_f: 0 - -optimizer: - name: decoupled_adamw - lr: # TODO - betas: - - 0.9 - - 0.999 - eps: 1.0e-8 - weight_decay: 0.0 - -# FSDP -fsdp_config: - sharding_strategy: FULL_SHARD - mixed_precision: FULL - activation_checkpointing: true - activation_checkpointing_reentrant: false - activation_cpu_offload: false - limit_all_gathers: true - verbose: false - -# Logging -progress_bar: false -log_to_console: true -console_log_interval: 1ba - -callbacks: - speed_monitor: - window_size: 10 - lr_monitor: { } - memory_monitor: { } - runtime_estimator: { } - -loggers: - wandb: { } diff --git a/src/panza/finetuning/configs/mistral_7b_rosa_panza.yaml b/src/panza/finetuning/configs/mistral_7b_rosa_panza.yaml deleted file mode 100644 index fa93a7b..0000000 --- a/src/panza/finetuning/configs/mistral_7b_rosa_panza.yaml +++ /dev/null @@ -1,113 +0,0 @@ -# This config trains lora and spa the whole time, which means it restarts the training after grad collection. 
- -max_seq_len: 512 -global_seed: 17 -model_name_or_path: #TODO - -load_path: # set via bash script to be absolute path to your sparse checkpoint -precision: amp_bf16 -hf_save_path: ./checkpoints - -max_duration: # TODO -eval_interval: 1 -# eval_first: false -seed: ${global_seed} - -global_train_batch_size: #TODO -# for mpt-7b dense: -# 4 x A100_80GB = "device_train_microbatch_size: 12" -# 8 x A6000_48GB = "device_train_microbatch_size: 6" - -# for mpt-7b sparse (with masks): -# 8 x A6000_48GB = "device_train_microbatch_size: 4" -device_train_microbatch_size: 16 -device_eval_batch_size: 16 - -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME - -model: - name: hf_causal_lm - pretrained: true - pretrained_model_name_or_path: ${model_name_or_path} - max_seq_len: ${max_seq_len} - output_hidden_states: true - weight_bias_dtype: #TODO - compute_dtype: bf16 - # config_overrides: - # attn_config: - # attn_impl: torch - # Set this to `true` if using `train_loader.dataset.packing_ratio` below - # attn_uses_sequence_id: true - -rosa: - lora_r: #TODO - spa_d: #TODO - lora_alpha: 16 - target_modules: 'all-linear' - lora_dropout: 0.05 - impl: auto - spa_store_transpose: true - rosa_dtype: bf16 - spa_num_grads: 1 - grad_acc_mode: mean_squared - mask_load_path: #TODO - mask_save_path: #TODO - terminate_after_mask_generation: #TODO - schedule: #TODO - -# Tokenizer -tokenizer: - name: ${model_name_or_path} - kwargs: - model_max_length: ${max_seq_len} - -# Dataloaders -train_loader: - name: finetuning - dataset: - hf_name: json - split: train - hf_kwargs: - data_files: #TODO - preprocessing_fn: preprocessing:panza_preprocessing_function - max_seq_len: ${max_seq_len} - allow_pad_trimming: false - decoder_only_format: true - shuffle: true - drop_last: false - num_workers: 8 - pin_memory: false - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - -# Optimization -scheduler: - name: linear_decay_with_warmup - t_warmup: 20ba - alpha_f: 0 - -optimizer: - name: decoupled_adamw - lr: # TODO - betas: - - 0.9 - - 0.999 - eps: 1.0e-8 - weight_decay: 0.0 - -# Logging -progress_bar: false -log_to_console: true -console_log_interval: 1ba - -callbacks: - speed_monitor: - window_size: 10 - lr_monitor: { } - memory_monitor: { } - runtime_estimator: { } - -loggers: - wandb: { } diff --git a/src/panza/finetuning/configs/rosa_panza_colab.yaml b/src/panza/finetuning/configs/rosa_panza_colab.yaml deleted file mode 100644 index 0c292b1..0000000 --- a/src/panza/finetuning/configs/rosa_panza_colab.yaml +++ /dev/null @@ -1,95 +0,0 @@ -max_seq_len: 512 -global_seed: 17 -model_name_or_path: #TODO - -load_path: # set via bash script to be absolute path to your sparse checkpoint -precision: fp32 -hf_save_path: ./checkpoints - -max_duration: # TODO -eval_interval: 1 -seed: ${global_seed} - -global_train_batch_size: #TODO -device_train_microbatch_size: 16 -device_eval_batch_size: 16 - -run_name: # If left blank, will be read from env var $RUN_NAME - -model: - name: hf_causal_lm - pretrained: true - pretrained_model_name_or_path: ${model_name_or_path} - max_seq_len: ${max_seq_len} - output_hidden_states: true - weight_bias_dtype: #TODO - compute_dtype: fp32 - -rosa: - lora_r: #TODO - spa_d: #TODO - lora_alpha: 16 - target_modules: 'all-linear' - lora_dropout: 0.05 - impl: auto - spa_store_transpose: true - rosa_dtype: fp32 - spa_num_grads: 1 - grad_acc_mode: mean_squared - grad_4bit_accum: true - mask_load_path: #TODO - mask_save_path: #TODO - terminate_after_mask_generation: #TODO - schedule: #TODO - 
-tokenizer: - name: ${model_name_or_path} - kwargs: - model_max_length: ${max_seq_len} - -train_loader: - name: finetuning - dataset: - hf_name: json - split: train - hf_kwargs: - data_files: #TODO - preprocessing_fn: preprocessing:panza_preprocessing_function - max_seq_len: ${max_seq_len} - allow_pad_trimming: false - decoder_only_format: true - shuffle: true - drop_last: false - num_workers: 8 - pin_memory: false - prefetch_factor: 2 - persistent_workers: true - timeout: 0 - -scheduler: - name: linear_decay_with_warmup - t_warmup: 20ba - alpha_f: 0 - -optimizer: - name: decoupled_adamw - lr: # TODO - betas: - - 0.9 - - 0.999 - eps: 1.0e-8 - weight_decay: 0.0 - -progress_bar: false -log_to_console: true -console_log_interval: 1ba - -callbacks: - speed_monitor: - window_size: 10 - lr_monitor: { } - memory_monitor: { } - runtime_estimator: { } - -loggers: - wandb: { } diff --git a/src/panza/finetuning/preprocessing.py b/src/panza/finetuning/preprocessing.py deleted file mode 100644 index 1e29219..0000000 --- a/src/panza/finetuning/preprocessing.py +++ /dev/null @@ -1,143 +0,0 @@ -import os -import random -from typing import Dict, List, Tuple - -from langchain_core.documents import Document - -from panza.utils import prompting, rag -from panza.utils.documents import Email - -SYSTEM_PREAMBLE_PATH = os.environ.get("PANZA_SYSTEM_PREAMBLE_PATH") -USER_PREAMBLE_PATH = os.environ.get("PANZA_USER_PREAMBLE_PATH") - -SYSTEM_PREAMBLE = prompting.load_preamble(SYSTEM_PREAMBLE_PATH) -USER_PREAMBLE = prompting.load_user_preamble(USER_PREAMBLE_PATH) - -PANZA_GENERATIVE_MODEL = os.environ.get("PANZA_GENERATIVE_MODEL") -PROMPT_START_WRAPPER, PROMPT_END_WRAPPER, RESPONSE_START_WRAPPER, RESPONSE_END_WRAPPER = ( - prompting.get_model_special_tokens(PANZA_GENERATIVE_MODEL) -) - -PANZA_FINETUNE_WITH_RAG = int(os.environ.get("PANZA_FINETUNE_WITH_RAG")) == 1 -if PANZA_FINETUNE_WITH_RAG: - EMBEDDINGS_MODEL = os.environ.get("PANZA_EMBEDDING_MODEL") - DB_PATH = os.environ.get("PANZA_DATA_DIR") - INDEX_NAME = os.environ.get("PANZA_USERNAME") - EMBEDDINGS_MODEL = rag.get_embeddings_model(EMBEDDINGS_MODEL) - DB = rag.load_vector_db_from_disk(DB_PATH, INDEX_NAME, EMBEDDINGS_MODEL) - RAG_PREAMBLE_PATH = os.environ.get("PANZA_RAG_PREAMBLE_PATH") - RAG_PREAMBLE = prompting.load_preamble(RAG_PREAMBLE_PATH) - RAG_NUM_EMAILS = int(os.environ.get("PANZA_FINETUNE_RAG_NUM_EMAILS")) - RAG_PROB = float(os.environ.get("PANZA_FINETUNE_RAG_PROB")) - RAG_RELEVANCE_THRESHOLD = float(os.environ.get("PANZA_FINETUNE_RAG_RELEVANCE_THRESHOLD")) - PANZA_SEED = int(os.environ.get("PANZA_SEED")) - random.seed(PANZA_SEED) - -PANZA_FINETUNE_WITH_THREAD = int(os.environ.get("PANZA_FINETUNE_WITH_THREAD")) == 1 -if PANZA_FINETUNE_WITH_THREAD: - THREAD_PREAMBLE_PATH = os.environ.get("PANZA_THREAD_PREAMBLE_PATH") - THREAD_PREAMBLE = prompting.load_preamble(THREAD_PREAMBLE_PATH) - THREAD_NUM_EMAILS = int(os.environ.get("PANZA_FINETUNE_THREAD_NUM_EMAILS")) - -r"""Example custom preprocessing function. - -This is here to help illustrate the way to set up finetuning -on a local dataset. One step of that process is to create -a preprocessing function for your dataset, and that is what -is done below. Check out the LLM Finetuning section of -`../README.md` for more context. - -For this example, we're going to pretend that our local dataset -is `./train.jsonl`. - -Note: this dataset is actually a copy of one of our ARC-Easy -multiple-choice ICL eval datasets. And you would never actually -train on eval data! ... But this is just a demonstration. 
- -Every example within the dataset has the format: -{ - 'query': , - 'choices': [, , ...], - 'gold': # index of correct choice -} - -To enable finetuning, we want to turn this into a prompt/response -format. We'll structure prompts and responses like this: -{ - 'prompt': \nOptions:\n - \n - \nAnswer: , - 'response': -} -""" - - -def filter_relevant_emails(relevant_emails_with_score: List[Tuple[Email, float]]) -> List[Email]: - # Random chance to not include any relevant emails - p = random.random() - if p > RAG_PROB: - relevant_emails = [] - print("Skip RAG") - return relevant_emails - - if not relevant_emails: - print("Relevant emails not found.") - return [] - - print("Don't skip") - relevant_emails = [r["email"] for r in relevant_emails if r["score"] >= RAG_RELEVANCE_THRESHOLD] - relevant_emails = [Document(page_content=email, metadata={}) for email in relevant_emails] - relevant_emails = relevant_emails[:RAG_NUM_EMAILS] - print(f"Found {len(relevant_emails)} relevant emails.") - return relevant_emails - - -def panza_preprocessing_function(inp: Dict) -> Dict: - try: - prompt_raw = inp["summary"].split("\n\nInstruction: ")[-1] - return { - "prompt": PROMPT_START_WRAPPER + prompt_raw + PROMPT_END_WRAPPER, - "response": RESPONSE_START_WRAPPER + inp["email"] + RESPONSE_END_WRAPPER, - } - except Exception as e: - raise ValueError(f"Unable to extract prompt/response from {inp}") from e - - -def panza_preprocessing_function_train_with_preamble(inp: Dict) -> Dict: - try: - prompt_raw = inp["summary"].split("\n\nInstruction: ")[-1] - if PANZA_FINETUNE_WITH_RAG: - relevant_emails_with_score = inp.get("relevant_emails", []) - relevant_emails_with_score = [ - (Email.deserialize(email), score) for (email, score) in relevant_emails_with_score - ] - relevant_emails = filter_relevant_emails(relevant_emails_with_score) - prompt = prompting.create_prompt( - prompt_raw, SYSTEM_PREAMBLE, USER_PREAMBLE, RAG_PREAMBLE, relevant_emails - ) - print(prompt) - else: - prompt = prompting.create_prompt(prompt_raw, SYSTEM_PREAMBLE, USER_PREAMBLE) - return { - "prompt": PROMPT_START_WRAPPER + prompt + PROMPT_END_WRAPPER, - "response": RESPONSE_START_WRAPPER + inp["email"] + RESPONSE_END_WRAPPER, - } - except Exception as e: - raise ValueError(f"Unable to extract prompt/response from {inp}") from e - - -def panza_preprocessing_function_train_with_thread(inp: Dict) -> Dict: - try: - prompt_raw = inp["summary"].split("\n\nInstruction: ")[-1] - if PANZA_FINETUNE_WITH_THREAD: - thread = inp.get("thread", []) - thread = thread[:THREAD_NUM_EMAILS] - prompt = prompting.create_prompt( - prompt_raw, SYSTEM_PREAMBLE, USER_PREAMBLE, thread_preamble=THREAD_PREAMBLE, thread_emails=thread - ) - else: - prompt = prompting.create_prompt(prompt_raw, SYSTEM_PREAMBLE, USER_PREAMBLE) - return { - "prompt": PROMPT_START_WRAPPER + prompt + PROMPT_END_WRAPPER, - "response": RESPONSE_START_WRAPPER + inp["email"] + RESPONSE_END_WRAPPER, - } - except Exception as e: - raise ValueError(f"Unable to extract prompt/response from {inp}") from e diff --git a/src/panza3/finetuning/preprocessing.py b/src/panza3/finetuning/preprocessing.py new file mode 100644 index 0000000..346bae2 --- /dev/null +++ b/src/panza3/finetuning/preprocessing.py @@ -0,0 +1,51 @@ +import os +from typing import Dict + +import hydra +from omegaconf import OmegaConf +from transformers import AutoConfig, AutoTokenizer + +from panza3.entities import EmailInstruction + +PREPROCESSING_CONFIG_FILE = os.environ.get("PANZA_PREPROCESSING_CONFIG") +if PREPROCESSING_CONFIG_FILE: + 
print("Hello from preprocessing.py") + + preprocessing_config = OmegaConf.load(PREPROCESSING_CONFIG_FILE) + prompt_builder = hydra.utils.instantiate(preprocessing_config.prompting) + + # Load tokenizer + config = AutoConfig.from_pretrained(preprocessing_config.model) + tokenizer = AutoTokenizer.from_pretrained( + preprocessing_config.model, model_max_length=config.max_position_embeddings + ) + + +def panza_preprocessing_function(inputs: Dict) -> Dict: + try: + prompt_raw = inputs["summary"].split("\n\nInstruction: ")[-1] + instruction = EmailInstruction(instruction=prompt_raw, thread=inputs.get("thread", [])) + prompt = prompt_builder.build_prompt(instruction) + + print(f"Prompt: {prompt}") + + # Generate the full conversation + conversation = [ + {"role": "user", "content": prompt}, + {"role": "assistant", "content": inputs["email"]} + ] + chat_prompt = tokenizer.apply_chat_template(conversation, tokenize=False) + + # Identify the index where the response begins + response_begin_index = chat_prompt.index(inputs["email"]) + + # Split the full prompt into prompt and response + prompt = chat_prompt[:response_begin_index] + response = chat_prompt[response_begin_index:] + + return { + "prompt": prompt, + "response": response, + } + except Exception as e: + raise ValueError(f"Unable to extract prompt/response from {inputs}") from e diff --git a/src/panza/finetuning/train.py b/src/panza3/finetuning/train.py similarity index 97% rename from src/panza/finetuning/train.py rename to src/panza3/finetuning/train.py index 82c7c23..8f84e23 100644 --- a/src/panza/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -10,54 +10,40 @@ from typing import Any, Dict, List, Optional, Union import torch -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType, FullStateDictConfig - -from composer.optim import DecoupledAdamW - -from composer.metrics.nlp import (InContextLearningCodeEvalAccuracy, - InContextLearningLMAccuracy, +from composer import Trainer +from composer.core.callback import Callback +from composer.metrics.nlp import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, InContextLearningLMExpectedCalibrationError, InContextLearningMCExpectedCalibrationError, InContextLearningMultipleChoiceAccuracy, - InContextLearningQAAccuracy, - LanguageCrossEntropy, LanguagePerplexity) - -from llmfoundry.models.utils import init_empty_weights - -from transformers import PreTrainedTokenizerBase, AutoModelForCausalLM, BitsAndBytesConfig - -from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP - -from llmfoundry import ComposerHFCausalLM - -import os, sys -from peft.tuners.rosa import RosaModel, RosaScheduler, RosaConfig -from peft import get_peft_model - -from composer import Trainer -from composer.core.callback import Callback -from composer.profiler import (JSONTraceHandler, Profiler, TraceHandler, - cyclic_schedule) + InContextLearningQAAccuracy, LanguageCrossEntropy, + LanguagePerplexity) +from composer.optim import DecoupledAdamW +from composer.profiler import JSONTraceHandler, Profiler, TraceHandler, cyclic_schedule from composer.utils import dist, get_device, reproducibility +from llmfoundry import ComposerHFCausalLM +from llmfoundry.eval.metrics.nlp import InContextLearningMetric +from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP +from llmfoundry.models.utils import init_empty_weights +from llmfoundry.utils import find_mosaicml_logger, log_train_analytics, maybe_create_mosaicml_logger from omegaconf import DictConfig, 
ListConfig from omegaconf import OmegaConf as om +from peft import get_peft_model +from peft.tuners.rosa import RosaConfig, RosaModel, RosaScheduler from rich.traceback import install - -from llmfoundry.eval.metrics.nlp import InContextLearningMetric -from llmfoundry.utils import (find_mosaicml_logger, log_train_analytics, - maybe_create_mosaicml_logger) +from torch.distributed.fsdp import FullStateDictConfig +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import StateDictType +from transformers import AutoModelForCausalLM, BitsAndBytesConfig, PreTrainedTokenizerBase install() from llmfoundry.callbacks import AsyncEval from llmfoundry.data.dataloader import build_dataloader from llmfoundry.layers_registry import ffns_with_megablocks -from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, - build_algorithm, build_callback, - build_composer_model, build_evaluators, - build_logger, build_optimizer, - build_scheduler, build_tokenizer) -from llmfoundry.utils.config_utils import (log_config, pop_config, - process_init_device, +from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, build_algorithm, build_callback, + build_composer_model, build_evaluators, build_logger, + build_optimizer, build_scheduler, build_tokenizer) +from llmfoundry.utils.config_utils import (log_config, pop_config, process_init_device, update_batch_size_info) from llmfoundry.utils.registry_utils import import_file @@ -330,7 +316,7 @@ def main(cfg: DictConfig) -> Trainer: hf_save_path: Union[int, str] = pop_config(cfg, 'hf_save_path', must_exist=True) - + eval_loader_config: Optional[Union[DictConfig, ListConfig]] = pop_config( cfg, 'eval_loader', must_exist=False, default_value=None) icl_tasks_config: Optional[Union[ListConfig, @@ -622,7 +608,7 @@ def main(cfg: DictConfig) -> Trainer: model = build_composer_peft_model(model_config, rosa_config, tokenizer, is_fsdp=fsdp_config is not None) if rosa_config is not None: assert isinstance(model.model.base_model, RosaModel) - + # Algorithms algorithms = [ build_algorithm(str(name), algorithm_cfg) @@ -631,10 +617,12 @@ def main(cfg: DictConfig) -> Trainer: if rosa_config is not None: algorithms.append(RosaScheduler(model.model.base_model)) - + # Dataloaders log.info('Building train loader...') try: + from datasets import disable_caching + disable_caching() train_loader = build_dataloader( train_loader_config, tokenizer, @@ -726,7 +714,7 @@ def main(cfg: DictConfig) -> Trainer: {'params': lora_params, 'lr': rosa_config['lora_lr']} ] optimizer = DecoupledAdamW(params, **optimizer_config) - + # Now add the eval metrics @@ -799,7 +787,7 @@ def main(cfg: DictConfig) -> Trainer: log.info('Starting training...') trainer.fit() - # if rosa is enabled, save the model manually, since + # if rosa is enabled, save the model manually, since # llm-foundry's checkpointing doesn't work properly with RoSA if rosa_config is not None: assert fsdp_config is None, 'fsdp is cuurently not supported with RoSA' From 48ba916901dc40a25a0348cd69c686e085592e08 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Mon, 2 Sep 2024 14:00:47 +0200 Subject: [PATCH 035/112] comment out the running examples --- src/panza3/run_panza.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/panza3/run_panza.py b/src/panza3/run_panza.py index 4c21a62..22183a3 100644 --- a/src/panza3/run_panza.py +++ b/src/panza3/run_panza.py @@ -3,18 +3,20 @@ llm = 
OllamaLLM("custom", "path/to/file", {}) -messages: ChatHistoryType = [{"role": "user", "content": "Write a one-sentence email saying i will be late to the meeting"}] +messages: ChatHistoryType = [ + {"role": "user", "content": "Write a one-sentence email saying i will be late to the meeting"} +] # Example of how to use the LLM with no streaming -stream = llm.chat_stream(messages) -while True: - try: - print(next(stream)) - except StopIteration: - break - +# stream = llm.chat_stream(messages) +# while True: +# try: +# print(next(stream)) +# except StopIteration: +# break + # Example of how to use the LLM with streaming -print(llm.chat(messages)) +# print(llm.chat(messages)) # create a new PanzaWebService -service = PanzaWebService() \ No newline at end of file +service = PanzaWebService() From b662f807d06ff92551ecfbdf37d77a6a7016fbef Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Mon, 2 Sep 2024 14:01:16 +0200 Subject: [PATCH 036/112] split dependencies into base and training and add documentation in README_panza3 for that --- README_panza3.md | 198 +++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 12 ++- 2 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 README_panza3.md diff --git a/README_panza3.md b/README_panza3.md new file mode 100644 index 0000000..0762c96 --- /dev/null +++ b/README_panza3.md @@ -0,0 +1,198 @@ +
+ panza demo +
+ +# Panza: A personal email assistant, trained and running on-device + + + +## What is Panza? + + + + +Panza is an automated email assistant customized to your writing style and past email history. \ +Its main features are as follows: +* Panza produces a fine-tuned LLM that matches your writing style, pairing it with a Retrieval-Augmented Generation (RAG) component which helps it produce relevant emails. +* Panza **can be trained and run entirely locally**. Currently, it requires a single GPU with +16-24 GiB of memory, but we also plan to release a CPU-only version. **At no point in training or execution is your data shared with the entities that trained the original LLMs, with LLM distribution services such as Huggingface, or with us.** +* Training and execution are also quick - for a dataset on the order of 1000 emails, training Panza takes well under an hour, and generating a new email takes a few seconds at most. + +
+ panza logo +
+ + +## TODO: Prerequisites +- Your emails, exported to `mbox` format (see tutorial below). +- A computer, preferably with a NVIDIA GPU with at least 24 GiB of memory (alternatively, check out [running in Google Colab](#cloud-try-out-panza-in-google-colab)). +- A Hugging Face [account](https://huggingface.co/login) to download the models (free of charge). +- [Optional] A Weights & Biases [account](https://wandb.ai/login) to log metrics during training (free of charge). +- Basic Python and Unix knowledge, such as building environments and running python scripts. +- *No prior LLMs experience is needed*. + + +## How it works + +### :film_projector: Step 1: Data playback + +For most email clients, it is possible to download a user's past emails in a machine-friendly .mbox format. For example, GMail allows you to do this via [Google Takeout](https://takeout.google.com), whereas Thunderbird allows one to do this via various plugins. + +One key part of Panza is a dataset-generation technique we call **data playback**: Given some of your past emails in .mbox format, we automatically create a training set for Panza by using a pretrained LLM to summarize the emails in instruction form; each email becomes a `(synthetic instruction, real email)` pair. +Given a dataset consisting of all pairs, we use these pairs to "play back" your sent emails: the LLM receives only the instruction, and has to generate the "ground truth" email as a training target. + +We find that this approach is very useful for the LLM to "learn" the user's writing style. + + +### :weight_lifting: Step 2: Local Fine-Tuning via Robust Adaptation (RoSA) + +We then use parameter-efficient finetuning to train the LLM on this dataset, locally. We found that we get the best results with the [RoSA method](https://arxiv.org/pdf/2401.04679.pdf), which combines low-rank (LoRA) and sparse finetuning. If parameter efficiency is not a concern, that is, you have a more powerful GPU, then regular, full-rank/full-parameter finetuning can also be used. We find that a moderate amount of further training strikes the right balance between matching the writer's style without memorizing irrelevant details in past emails. + + +### :owl: Step 3: Serving via RAG + +Once we have a custom user model, Panza can be run locally together with a Retrieval-Augmented Generation (RAG) module. Specifically, this functionality stores past emails in a database and provides a few relevant emails as context for each new query. This allows Panza to better insert specific details, such as a writer's contact information or frequently used Zoom links. + +The overall structure of Panza is as follows: +
+ panza logo +
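
To make the data playback step concrete, here is a minimal sketch (not one of the Panza scripts; it assumes the `summary`/`email` fields that the data-preparation step in Step 3 below writes to `data/train.jsonl`) showing how a single `(synthetic instruction, real email)` pair is read back for training:

``` python
import json

# Minimal sketch: inspect one data-playback pair from data/train.jsonl.
# Assumes the "summary"/"email" fields produced by the data-preparation step.
with open("data/train.jsonl") as f:
    record = json.loads(f.readline())

# The synthetic instruction is the text after "Instruction: " in the summary.
instruction = record["summary"].split("\n\nInstruction: ")[-1]
real_email = record["email"]  # training target: the email you actually sent

print("INSTRUCTION:", instruction)
print("TARGET EMAIL:", real_email)
```

During fine-tuning, the synthetic instruction becomes the prompt and your real email becomes the response the model is trained to reproduce.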
+ +## Installation + +### Conda +1. Make sure you have a version of [conda](https://docs.anaconda.com/free/miniconda/miniconda-install/) installed. +2. Create a new conda environment named 'panza' (or something else) and activate it: +``` bash +conda create -n panza python=3.10 +conda activate panza +``` +3. Install the required packages: +``` bash +pip install panza_mail +``` +4. If you want to also finetune models using Panza, you will need to install the additional packages: +``` bash +pip install panza_mail[training] +``` + +## TODO: :rocket: Getting started + +To quickly get started with building your own personalized email assistant, follow the steps bellow: + + + + +### Step 0: Download your sent emails + +
+ Expand for detailed download instructions. + + We provide a description for doing this for GMail via Google Takeout. + + 1. Go to [https://takeout.google.com/](https://takeout.google.com/). + 2. Click `Deselect all`. + 3. Find `Mail` section (search for the phrase `Messages and attachments in your Gmail account in MBOX format`). + 4. Select it. + 5. Click on `All Mail data included` and deselect everything except `Sent`. + 6. Scroll to the bottom of the page and click `Next step`. + 7. Click on `Create export`. + 8. Wait for download link to arrive in your inbox. + 9. Download `Sent.mbox` and place it in the `data/` directory. + + For Outlook accounts, we suggest doing this via a Thunderbird plugin for exporting a subset of your email as an MBOX format, such as [this add-on](https://addons.thunderbird.net/en-us/thunderbird/addon/importexporttools-ng/). +
+ +At the end of this step you should have the downloaded emails placed inside `data/Sent.mbox`. + + +### Step 1: Environment configuration + + +Panza is configured through a set of environment variables defined in `scripts/config.sh` and shared along all running scripts. + + +The LLM prompt is controlled by a set of `prompt_preambles` that give the model more insight about its role, the user and how to reuse existing emails for *Retrieval-Augmented Generation (RAG)*. See more details in the [prompting section](prompt_preambles/README.md). + +:warning: Before continuing, make sure you complete the following setup: + - Modifiy the environment variable `PANZA_EMAIL_ADDRESS` inside `scripts/config.sh` with your own email address. + - Modifiy `prompt_preambles/user_preamble.txt` with your own information. If you choose, this can even be empty. + - Login to Hugging Face to be able to download pretrained models: `huggingface-cli login`. + - [Optional] Login to Weights & Biases to log metrics during training: `wandb login`. Then, set `PANZA_WANDB_DISABLED=False` in `scripts/config.sh`. + +You are now ready to move to `scripts`. +``` bash +cd scripts +``` + +### Step 2: Extract emails + + +1. Run `./extract_emails.sh`. This extracts your emails in text format to `data/_clean.jsonl` which you can manually inspect. + +2. If you wish to eliminate any emails from the training set (e.g. containing certain personal information), you can simply remove the corresponding rows. + +### Step 3: Prepare dataset + + +1. Simply run `./prepare_dataset.sh`.
+ This scripts takes care of all the prerequisites before training (expand for details). + + - Creates synthetic prompts for your emails as described in the [data playback](#film_projector-step-1-data-playback) section. The results are stored in `data/_clean_summarized.jsonl` and you can inspect the `"summary"` field. + - Splits data into training and test subsets. See `data/train.jsonl` and `data/test.jsonl`. + - Creates a vector database from the embeddings of the training emails which will later be used for *Retrieval-Augmented Generation (RAG)*. See `data/.pkl` and `data/.faiss`. +
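
To sanity-check the vector database created above, the sketch below (not one of the provided scripts; it relies on the `panza.utils.rag` helpers used elsewhere in the repository, and the data directory, index name, and query are placeholders) loads the FAISS index and retrieves the most similar past emails for a query:

``` python
from panza.utils import rag

# Minimal sketch: query the FAISS index built by ./prepare_dataset.sh.
# The embedding model matches the default in configs/base.yaml; the data
# directory and index name (your username) are placeholders to adjust.
embeddings_model = rag.get_embeddings_model("sentence-transformers/all-mpnet-base-v2")
db = rag.load_vector_db_from_disk("../data", "yourusername", embeddings_model)

# Retrieve the three most similar past emails (method provided by the
# underlying LangChain FAISS store).
results = db.similarity_search_with_relevance_scores(
    "Can we reschedule tomorrow's meeting?", k=3
)
for doc, score in results:
    print(f"{score:.2f}  {doc.page_content[:80]}")
```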
+ +### Step 4: Train a LLM on your emails + + +We currently support `LLaMA3-8B-Instruct` and `Mistral-Instruct-v0.2` LLMs as base models; the former is the default, but we obtained good results with either model. + +1. [Recommended] For parameter efficient fine-tuning, run `./train_rosa.sh`. +If a larger GPU is available and full-parameter fine-tuning is possible, run `./train_fft.sh`. + +2. We have prepopulated the training scripts with parameter values that worked best for us. We recommend you try those first, but you can also experiment with different hyper-parameters by passing extra arguments to the training script, such as `LR`, `LORA_LR`, `NUM_EPOCHS`. All the trained models are saved in the `checkpoints` directory. + +Examples: +``` bash +./train_rosa.sh # Will use the default parameters. + +./train_rosa.sh LR=1e-6 LORA_LR=1e-6 NUM_EPOCHS=7 # Will override LR, LORA_LR, and NUM_EPOCHS. +``` + +### Step 5: Launch Panza! + + +1. Run `./run_panza_gui.sh MODEL=` to serve the trained model in a friendly GUI. +Alternatively, if you prefer using the CLI to interact with Panza, run `./run_panza_cli.sh` instead. + +You can experiment with the following arguments: +- If `MODEL` is not specified, it will use a pretrained `Meta-Llama-3-8B-Instruct` model by default, although Panza also works with `Mistral-7B-Instruct-v2`. Try it out to compare the syle difference! +- To disable RAG, run with `PANZA_DISABLE_RAG_INFERENCE=1`. + +Example: +``` bash +./run_panza_gui.sh \ + MODEL=/local/path/to/this/repo/checkpoints/models/panza-rosa_1e-6-seed42_7908 \ + PANZA_DISABLE_RAG_INFERENCE=0 # this is the default behaviour, so you can omit it +``` + +:email: **Have fun with your new email writing assistant!** :email: + + + + +## :microscope: Advanced usage +- [Data Preparation Guide](./scripts/README.md#data-guide) +- [Hyper-Parameter Tuning Guide](./scripts/README.md#hyper-parameter-tuning-guide) +- [Prompt Preambles Tutorial](prompt_preambles/README.md) + +## Authors + +Panza was conceived by Nir Shavit and Dan Alistarh and built by the [Distributed Algorithms and Systems group](https://ist.ac.at/en/research/alistarh-group/) at IST Austria. The contributors are (in alphabetical order): + +Dan Alistarh, Eugenia Iofinova, Eldar Kurtic, Ilya Markov, Armand Nicolicioiu, Mahdi Nikdan, Andrei Panferov, and Nir Shavit. + +Contact: dan.alistarh@ist.ac.at + +We thank our collaborators Michael Goin and Tony Wang at NeuralMagic and MIT for their helpful testing and feedback. diff --git a/pyproject.toml b/pyproject.toml index c79d5b6..a68dfdb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,16 @@ version = "2024.08.14" description = "A personal email assistant, trained and running on-device." 
dependencies = [ "torch==2.2.2", + "ollama", + "omegaconf", + "fastapi", + "uvicorn", + "pydantic", + "python-dotenv" +] + +[project.optional-dependencies] +training = [ "langdetect", "langchain", "langchain-community", @@ -17,11 +27,9 @@ dependencies = [ "cmake", "packaging", "nltk", - "ollama", "llm-foundry@git+https://github.com/IST-DASLab/llm-foundry", "peft@git+https://github.com/IST-DASLab/peft-rosa.git@grad_quant_looser_versioning", "spops-sm-80", - "omegaconf", ] [build-system] From 4b2fa765efa56445046858a2cb20df518bc24c23 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Mon, 2 Sep 2024 14:32:08 +0200 Subject: [PATCH 037/112] add hydra to dependencies --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a68dfdb..e0e526e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,8 @@ dependencies = [ "fastapi", "uvicorn", "pydantic", - "python-dotenv" + "python-dotenv", + "hydra-core" ] [project.optional-dependencies] From d4bd1aa30c68973fcef55c556f95d92fe0df69a8 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Tue, 3 Sep 2024 11:41:03 +0200 Subject: [PATCH 038/112] Fix unused num_thread_emails parameter --- src/panza3/prompting/email_prompting.py | 4 +++- tests/test_prompting.py | 4 ---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/panza3/prompting/email_prompting.py b/src/panza3/prompting/email_prompting.py index ebfc590..45f01db 100644 --- a/src/panza3/prompting/email_prompting.py +++ b/src/panza3/prompting/email_prompting.py @@ -117,7 +117,9 @@ def build_prompt( rag_prompt = "" if use_thread: - thread_prompt = self._create_threading_preamble(instruction.thread).strip() + thread_prompt = self._create_threading_preamble( + instruction.thread[: self.number_thread_emails] + ).strip() else: thread_prompt = "" diff --git a/tests/test_prompting.py b/tests/test_prompting.py index bdd4eef..48e1e2e 100644 --- a/tests/test_prompting.py +++ b/tests/test_prompting.py @@ -75,8 +75,6 @@ def test_email_prompt_builder( + "E-MAIL CONTENT:\nemail2\n\n---\n\n" + "THREAD PREAMBLE:\n\n" + "email1\n\n---\n\n" - + "email2\n\n---\n\n" - + "email3\n\n---\n\n" + "Instruction: Write an email." ) @@ -87,8 +85,6 @@ def test_email_prompt_builder( + "\n\n" + "THREAD PREAMBLE:\n\n" + "email1\n\n---\n\n" - + "email2\n\n---\n\n" - + "email3\n\n---\n\n" + "Instruction: Write an email." 
) From 8df8f112ee9a0ce1532451fd3c6e4e89a0f20f75 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Tue, 3 Sep 2024 11:48:37 +0200 Subject: [PATCH 039/112] Add RoSA runner --- configs/finetuning/rosa.yaml | 11 +++-- scripts/finetune.py | 90 ++++++++++++++++++++++++++---------- 2 files changed, 73 insertions(+), 28 deletions(-) diff --git a/configs/finetuning/rosa.yaml b/configs/finetuning/rosa.yaml index 20d2b4b..5c9a541 100644 --- a/configs/finetuning/rosa.yaml +++ b/configs/finetuning/rosa.yaml @@ -1,7 +1,10 @@ +defaults: + - base + rosa: lora_lr: ${lr} lora_r: 8 - spa_d: #TODO + spa_d: 0.01 lora_alpha: 16 target_modules: 'all-linear' lora_dropout: 0.05 @@ -9,7 +12,7 @@ rosa: spa_store_transpose: true rosa_dtype: bf16 spa_num_grads: 1 - grad_acc_mode: mean_squared + grad_acc_mode: mean_squared # 'mean' or 'mean_squared': how to accumulate gradients mask_load_path: #TODO mask_save_path: #TODO terminate_after_mask_generation: #TODO @@ -17,4 +20,6 @@ rosa: mask_gen_model_precision: #TODO scheduler: - t_warmup: 8ba \ No newline at end of file + t_warmup: 8ba + +num_cpu_threads: 0 \ No newline at end of file diff --git a/scripts/finetune.py b/scripts/finetune.py index fb6f51a..37f73da 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -44,8 +44,33 @@ def create_run_name(cfg: DictConfig) -> str: return run_name -def determine_rosa_schedule(cfg: DictConfig) -> str: - pass +def override_rosa_schedule(cfg: DictConfig, mask_generation=False) -> None: + # Disable struct mode to allow modifications + rosa_cfg = cfg.finetuning.rosa + OmegaConf.set_struct(rosa_cfg, False) + + mask_path = str(Path(cfg.checkpoint_dir) / "masks" / cfg.finetuning.run_name) + + if mask_generation: + rosa_cfg.schedule = "wl16" if rosa_cfg.lora_r != 0 else "spa_only" + rosa_cfg.mask_load_path = None + rosa_cfg.mask_save_path = mask_path + rosa_cfg.terminate_after_mask_generation = True + else: + if rosa_cfg.spa_d == 0 and rosa_cfg.lora_r != 0: + rosa_cfg.schedule = "default" + elif rosa_cfg.lora_r != 0: + rosa_cfg.schedule = "lora_only" + rosa_cfg.mask_load_path = None + else: + rosa_cfg.schedule = "spa_only" + + rosa_cfg.mask_load_path = mask_path + rosa_cfg.mask_save_path = None + rosa_cfg.terminate_after_mask_generation = None + + # Re-enable struct mode to lock down the configuration + OmegaConf.set_struct(rosa_cfg, True) def create_experiment_yaml() -> str: @@ -54,18 +79,18 @@ def create_experiment_yaml() -> str: def create_checkpoint_dirs(cfg: DictConfig) -> None: # Create model directory - os.makedirs(cfg.finetuning.hf_save_path) + os.makedirs(os.path.join(cfg.checkpoint_dir, "models"), exist_ok=True) # Create mask directory if hasattr(cfg.finetuning, "rosa"): - os.makedirs(cfg.finetuning.rosa.mask_save_path) + os.makedirs(os.path.join(cfg.checkpoint_dir, "masks"), exist_ok=True) def get_hf_save_precision(cfg: DictConfig) -> str: if cfg.model_precision == "bf16": return "bfloat16" elif cfg.model_precision == "fp32": - return "fp32" + return "float32" else: raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") @@ -88,7 +113,7 @@ def override_config(cfg: DictConfig) -> None: cfg.finetuning.run_name = create_run_name(cfg) if hasattr(cfg.finetuning, "rosa"): - pass + cfg.finetuning.rosa.rosa_dtype = get_rosa_dtype(cfg) else: cfg.finetuning.callbacks.hf_checkpointer.precision = get_hf_save_precision(cfg) @@ -118,20 +143,6 @@ def terminate_process_tree(pid: str): except psutil.NoSuchProcess: pass - def move_checkpoint_files(cfg: DictConfig) -> None: - # Move checkpoint files to the final 
directory - run_save_path = Path(cfg.hf_save_path) / "models" / cfg.run_name - huggingface_dir = run_save_path / "huggingface" - last_save_dir_name = max(huggingface_dir.iterdir(), key=os.path.getmtime).name - - # Move the contents of the last saved directory to the run save path - source_dir = huggingface_dir / last_save_dir_name - for item in source_dir.iterdir(): - shutil.move(str(item), run_save_path) - - # Remove the now-empty huggingface directory - shutil.rmtree(huggingface_dir) - train_script = os.path.join(cfg.panza_workspace, "src/panza3/finetuning/train.py") environment = os.environ.copy() environment["PYTHONPATH"] = os.path.join(cfg.panza_workspace, "src") @@ -170,10 +181,20 @@ def move_checkpoint_files(cfg: DictConfig) -> None: torch.cuda.empty_cache() time.sleep(3) # Give some time for GPU resources to be released - if not hasattr(cfg.finetuning, "rosa"): - move_checkpoint_files(cfg) - print("Find the finetuned model at", os.path.join(cfg.hf_save_path, "models", cfg.run_name)) +def move_checkpoint_files(cfg: DictConfig) -> None: + # Move checkpoint files to the final directory + run_save_path = Path(cfg.hf_save_path) / "models" / cfg.finetuning.run_name + huggingface_dir = run_save_path / "huggingface" + last_save_dir_name = max(huggingface_dir.iterdir(), key=os.path.getmtime).name + + # Move the contents of the last saved directory to the run save path + source_dir = huggingface_dir / last_save_dir_name + for item in source_dir.iterdir(): + shutil.move(str(item), run_save_path) + + # Remove the now-empty huggingface directory + shutil.rmtree(huggingface_dir) @hydra.main(version_base="1.1", config_path="../configs", config_name="panza_finetuning") @@ -184,13 +205,32 @@ def main(cfg: DictConfig) -> None: # Override configuration override_config(cfg) + create_checkpoint_dirs(cfg) + # Launch training + preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) + if "rosa" in cfg.finetuning: - pass + # Generate masks + if cfg.finetuning.rosa.spa_d != 0: + override_rosa_schedule(cfg, mask_generation=True) + finetuning_yaml = save_config_to_yaml(cfg.finetuning) + # pdb.set_trace() + launch_experiment(cfg, finetuning_yaml, preprocessing_yaml) + # RoSA finetuning + override_rosa_schedule(cfg, mask_generation=False) + finetuning_yaml = save_config_to_yaml(cfg.finetuning) + # pdb.set_trace() + launch_experiment(cfg, finetuning_yaml, preprocessing_yaml) else: finetuning_yaml = save_config_to_yaml(cfg.finetuning) - preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) launch_experiment(cfg, finetuning_yaml, preprocessing_yaml) + move_checkpoint_files(cfg) + + print( + "Find the finetuned model at", + os.path.join(cfg.hf_save_path, "models", cfg.finetuning.run_name), + ) if __name__ == "__main__": From 2c221cc2b4489289bc8574ee9666757cb8f1541b Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Tue, 3 Sep 2024 11:53:24 +0200 Subject: [PATCH 040/112] Temporarily rename serialized document in vector db metadata --- src/panza3/retriever/faiss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/panza3/retriever/faiss.py b/src/panza3/retriever/faiss.py index 466270b..fcdfa9a 100644 --- a/src/panza3/retriever/faiss.py +++ b/src/panza3/retriever/faiss.py @@ -73,7 +73,7 @@ def retrieve_with_score( # Deserialize metadata results = [ - (self.document_class.deserialize(r[0].metadata["serialized_document"]), r[1]) + (self.document_class.deserialize(r[0].metadata["serialized_email"]), r[1]) for r in results ] From e72d13cee2752e8e2e7a36d705b49aecea5f36e2 Mon Sep 17 
00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:39:05 +0200 Subject: [PATCH 041/112] move some dependencies from training into base --- pyproject.toml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e0e526e..c41cb4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,16 +10,16 @@ dependencies = [ "uvicorn", "pydantic", "python-dotenv", - "hydra-core" + "hydra-core", + "langchain", + "langchain-community", + "sentence-transformers", + "faiss-cpu", ] [project.optional-dependencies] training = [ "langdetect", - "langchain", - "langchain-community", - "sentence-transformers", - "faiss-cpu", "fire", "mauve-text", "evaluate", From 7447aac0d4de2abd2f7760f1aff9615080f2fb26 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:40:07 +0200 Subject: [PATCH 042/112] add outputs folder to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 317816a..7bcaa84 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ checkpoints/ results/ wandb/ +outputs/ *.log *.egg-info From b151fc3daada235ece69939638d9ae8905822a5a Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:41:05 +0200 Subject: [PATCH 043/112] add writer to the constructor arguments of the web service --- src/panza3/hosting/web_service.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/panza3/hosting/web_service.py b/src/panza3/hosting/web_service.py index 1893bd2..f37627f 100644 --- a/src/panza3/hosting/web_service.py +++ b/src/panza3/hosting/web_service.py @@ -4,6 +4,8 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi import FastAPI, HTTPException, Header from fastapi.responses import StreamingResponse +from panza3.entities.instruction import EmailInstruction, Instruction +from panza3.writer import PanzaWriter import uvicorn from pydantic import BaseModel from dotenv import load_dotenv @@ -17,8 +19,9 @@ class Request(BaseModel): class PanzaWebService: DEFAULT_PORT = 5001 - def __init__(self, port=DEFAULT_PORT): + def __init__(self, writer: PanzaWriter, port=DEFAULT_PORT): self.app = FastAPI() + self.writer = writer self.port = port self._setup_routes() load_dotenv() @@ -43,10 +46,9 @@ def _streamer(self, stream): yield chunk["message"]["content"] def _predict(self, input: str) -> Generator: - # TODO: Call PanzaWriter here - # Dummy generator - for i in range(10): - yield {"message": {"content": f"Generated text {i}"}} + instruction: Instruction = EmailInstruction(input) + stream: Generator = self.writer.run(instruction, stream=True) + return stream def _setup_routes(self): @self.app.options("/generate") From 77c186747dff26b14e3f803f8da4e3f5e0439afb Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:41:46 +0200 Subject: [PATCH 044/112] delete run_panza.py bc its just a test file --- src/panza3/run_panza.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 src/panza3/run_panza.py diff --git a/src/panza3/run_panza.py b/src/panza3/run_panza.py deleted file mode 100644 index 22183a3..0000000 --- a/src/panza3/run_panza.py +++ /dev/null @@ -1,22 +0,0 @@ -from hosting import PanzaWebService -from llm import OllamaLLM, ChatHistoryType - -llm = OllamaLLM("custom", "path/to/file", {}) - -messages: 
ChatHistoryType = [ - {"role": "user", "content": "Write a one-sentence email saying i will be late to the meeting"} -] - -# Example of how to use the LLM with no streaming -# stream = llm.chat_stream(messages) -# while True: -# try: -# print(next(stream)) -# except StopIteration: -# break - -# Example of how to use the LLM with streaming -# print(llm.chat(messages)) - -# create a new PanzaWebService -service = PanzaWebService() From fc7db3f913e8f86e2ab0e0267a3fcdb23f35583d Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:42:44 +0200 Subject: [PATCH 045/112] rename constructor argument for the Ollama LLM class to match the local LLM classes --- src/panza3/llm/ollama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/panza3/llm/ollama.py b/src/panza3/llm/ollama.py index 75806a9..b7f803c 100644 --- a/src/panza3/llm/ollama.py +++ b/src/panza3/llm/ollama.py @@ -12,13 +12,13 @@ class OllamaLLM(LLM): - def __init__(self, name: str, gguf_file: str, sampling_params: Dict): + def __init__(self, name: str, gguf_file: str, sampling_parameters: Dict): """ Loads and serves the model from the GGUF file into Ollama with the given name and sampling parameters. """ - super().__init__(name, sampling_params) + super().__init__(name, sampling_parameters) self.gguf_file = gguf_file - self.sampling_params = sampling_params + self.sampling_params = sampling_parameters if not self._is_ollama_running(): self._start_ollama() From b6f992151aadcd3ce2ca30e6d55a25ecbb4dbecb Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:11:46 +0200 Subject: [PATCH 046/112] add none retriever to allow running without RAG --- configs/writer/prompting/retriever/none.yaml | 1 + src/panza3/retriever/__init__.py | 3 +- src/panza3/retriever/none.py | 29 ++++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 configs/writer/prompting/retriever/none.yaml create mode 100644 src/panza3/retriever/none.py diff --git a/configs/writer/prompting/retriever/none.yaml b/configs/writer/prompting/retriever/none.yaml new file mode 100644 index 0000000..68be9b6 --- /dev/null +++ b/configs/writer/prompting/retriever/none.yaml @@ -0,0 +1 @@ +_target_: panza3.retriever.NoneRetriever \ No newline at end of file diff --git a/src/panza3/retriever/__init__.py b/src/panza3/retriever/__init__.py index e61fc97..780a4ca 100644 --- a/src/panza3/retriever/__init__.py +++ b/src/panza3/retriever/__init__.py @@ -1,4 +1,5 @@ from .base import DocumentRetriever from .faiss import FaissRetriever +from .none import NoneRetriever -__all__ = ["DocumentRetriever", "FaissRetriever"] +__all__ = ["DocumentRetriever", "FaissRetriever", "NoneRetriever"] diff --git a/src/panza3/retriever/none.py b/src/panza3/retriever/none.py new file mode 100644 index 0000000..f310da0 --- /dev/null +++ b/src/panza3/retriever/none.py @@ -0,0 +1,29 @@ +import logging +from typing import List, Optional, Tuple + +from ..entities.document import Document +from .base import DocumentRetriever + +LOGGER = logging.getLogger(__name__) + + +class NoneRetriever(DocumentRetriever): + def __init__( + self, + document_class: Optional[type[Document]] = None, + ) -> None: + self.document_class = document_class + + def retrieve(self, query: str, k: int, score: Optional[float] = None) -> List[Document]: + return [] + + def retrieve_with_score( + self, query: str, k: int, score: Optional[float] = None + ) -> List[Tuple[Document, 
float]]: + return [] + + def store(self, documents: List[Document], chunk_size: int, chunk_overlap: int): + pass + + def save_db_to_disk(self): + pass From d70f512f9021769ab0160c57d21c391a9ac7bae0 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Tue, 3 Sep 2024 16:12:24 +0200 Subject: [PATCH 047/112] add config for Ollama LLM --- configs/writer/llm/ollama.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/configs/writer/llm/ollama.yaml b/configs/writer/llm/ollama.yaml index 1afe70a..0a183b0 100644 --- a/configs/writer/llm/ollama.yaml +++ b/configs/writer/llm/ollama.yaml @@ -1 +1,6 @@ -# TODO: Add Ollama config \ No newline at end of file +defaults: + - sampling: random + +_target_: panza3.llm.OllamaLLM +name: "custom" +gguf_file: "custom.gguf" From 01b405270aff89c2451e326492f319569f507ab6 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:49:07 +0200 Subject: [PATCH 048/112] remove DEFAULT_PORT and add integer type hint --- src/panza3/hosting/web_service.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/panza3/hosting/web_service.py b/src/panza3/hosting/web_service.py index f37627f..1a39976 100644 --- a/src/panza3/hosting/web_service.py +++ b/src/panza3/hosting/web_service.py @@ -17,9 +17,7 @@ class Request(BaseModel): class PanzaWebService: - DEFAULT_PORT = 5001 - - def __init__(self, writer: PanzaWriter, port=DEFAULT_PORT): + def __init__(self, writer: PanzaWriter, port: int): self.app = FastAPI() self.writer = writer self.port = port From 1d05864d9b8fd9ff1eeaaf07053cfdc5121ae19a Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Thu, 5 Sep 2024 12:59:58 +0200 Subject: [PATCH 049/112] add interfaces --- configs/interfaces/cli.yaml | 1 + configs/interfaces/gui.yaml | 1 + configs/interfaces/web.yaml | 2 ++ configs/panza_writer.yaml | 4 ++- pyproject.toml | 1 + scripts/runner.py | 8 ++--- src/panza3/hosting/__init__.py | 3 -- src/panza3/interface/__init__.py | 5 ++++ src/panza3/interface/cli.py | 15 ++++++++++ src/panza3/interface/gui.py | 30 +++++++++++++++++++ .../web_service.py => interface/web.py} | 17 ++++++++++- 11 files changed, 76 insertions(+), 11 deletions(-) create mode 100644 configs/interfaces/cli.yaml create mode 100644 configs/interfaces/gui.yaml create mode 100644 configs/interfaces/web.yaml delete mode 100644 src/panza3/hosting/__init__.py create mode 100644 src/panza3/interface/__init__.py create mode 100644 src/panza3/interface/cli.py create mode 100644 src/panza3/interface/gui.py rename src/panza3/{hosting/web_service.py => interface/web.py} (80%) diff --git a/configs/interfaces/cli.yaml b/configs/interfaces/cli.yaml new file mode 100644 index 0000000..1b69409 --- /dev/null +++ b/configs/interfaces/cli.yaml @@ -0,0 +1 @@ +_target_: panza3.interface.PanzaCLI \ No newline at end of file diff --git a/configs/interfaces/gui.yaml b/configs/interfaces/gui.yaml new file mode 100644 index 0000000..49b5515 --- /dev/null +++ b/configs/interfaces/gui.yaml @@ -0,0 +1 @@ +_target_: panza3.interface.PanzaGUI \ No newline at end of file diff --git a/configs/interfaces/web.yaml b/configs/interfaces/web.yaml new file mode 100644 index 0000000..0ac3c70 --- /dev/null +++ b/configs/interfaces/web.yaml @@ -0,0 +1,2 @@ +port: 5001 +_target_: panza3.interface.PanzaWebService \ No newline at end of file diff --git a/configs/panza_writer.yaml b/configs/panza_writer.yaml index 
13fede3..3efc698 100644 --- a/configs/panza_writer.yaml +++ b/configs/panza_writer.yaml @@ -1,6 +1,8 @@ defaults: - base - writer: email - - user: default + - user: seanyang711 + - interfaces: + - gui checkpoint: "microsoft/Phi-3-mini-4k-instruct" \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c41cb4a..fec5c2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "langchain-community", "sentence-transformers", "faiss-cpu", + "gradio", ] [project.optional-dependencies] diff --git a/scripts/runner.py b/scripts/runner.py index 27ee94e..9c860b5 100644 --- a/scripts/runner.py +++ b/scripts/runner.py @@ -35,12 +35,8 @@ def main(cfg: DictConfig) -> None: writer: PanzaWriter = hydra.utils.instantiate(cfg.writer) assert isinstance(writer, PanzaWriter), "Failed to instantiate PanzaWriter" - # TODO: Connect to CLI / GUI / webserver, etc. - output, prompt = writer.run( - instruction=EmailInstruction(instruction="Write an email."), return_prompt=True - ) - print("Prompt:", prompt) - print("Output:", output) + # Instantiate interfaces (CLI, GUI, web, etc) as specified in the configuration + hydra.utils.instantiate(cfg.interfaces, writer=writer) if __name__ == "__main__": diff --git a/src/panza3/hosting/__init__.py b/src/panza3/hosting/__init__.py deleted file mode 100644 index b5367f4..0000000 --- a/src/panza3/hosting/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .web_service import PanzaWebService - -__all__ = ["PanzaWebService"] \ No newline at end of file diff --git a/src/panza3/interface/__init__.py b/src/panza3/interface/__init__.py new file mode 100644 index 0000000..9448a5f --- /dev/null +++ b/src/panza3/interface/__init__.py @@ -0,0 +1,5 @@ +from .web import PanzaWebService +from .cli import PanzaCLI +from .gui import PanzaGUI + +__all__ = ["PanzaWebService", "PanzaCLI", "PanzaGUI"] diff --git a/src/panza3/interface/cli.py b/src/panza3/interface/cli.py new file mode 100644 index 0000000..6ad11b5 --- /dev/null +++ b/src/panza3/interface/cli.py @@ -0,0 +1,15 @@ +from panza3.entities.instruction import EmailInstruction, Instruction +from panza3.writer import PanzaWriter + + +class PanzaCLI: + def __init__(self, writer: PanzaWriter, **kwargs): + self.writer = writer + while True: + user_input = input("Enter a command: ") + if user_input == "exit": + break + else: + instruction: Instruction = EmailInstruction(user_input) + response = self.writer.run(instruction) + print(response) diff --git a/src/panza3/interface/gui.py b/src/panza3/interface/gui.py new file mode 100644 index 0000000..f48d97d --- /dev/null +++ b/src/panza3/interface/gui.py @@ -0,0 +1,30 @@ +from panza3.entities.instruction import EmailInstruction, Instruction +from panza3.writer import PanzaWriter +import gradio as gr + + +class PanzaGUI: + def __init__(self, writer: PanzaWriter, **kwargs): + self.writer = writer + with gr.Blocks() as panza: + gr.Markdown("# Panza\n") + inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER") + outputbox = gr.Textbox(label="Output", placeholder="Generated result from the model") + inputbox.submit( + self.get_execute(), + [inputbox], + [outputbox], + ) + + panza.queue().launch(server_name="localhost", server_port=5002, share=True) + + def get_execute(self): + def execute(input): + instruction: Instruction = EmailInstruction(input) + stream = self.writer.run(instruction, stream=True) + output = "" + for chunk in stream: + output += chunk + yield output + + return execute diff --git a/src/panza3/hosting/web_service.py 
b/src/panza3/interface/web.py similarity index 80% rename from src/panza3/hosting/web_service.py rename to src/panza3/interface/web.py index 1a39976..26065ba 100644 --- a/src/panza3/hosting/web_service.py +++ b/src/panza3/interface/web.py @@ -26,6 +26,7 @@ def __init__(self, writer: PanzaWriter, port: int): self._add_cors() self.api_keys = self._get_valid_api_keys() self._start_server() + self.server_thread = None def _add_cors(self): self.app.add_middleware( @@ -61,4 +62,18 @@ def generate_text(request: Request, x_api_key: Annotated[str | None, Header()]): return StreamingResponse(self._streamer(stream), media_type="text/event-stream") def _start_server(self): - uvicorn.run(self.app, port=self.port) + self.server_thread = threading.Thread( + target=uvicorn.run, + args=(self.app,), + kwargs={"port": self.port}, + daemon=False, + ) + self.server_thread.start() + print("Panza web server started.") + + def _stop_server(self): + if self.server_thread is None: + return + self.server_thread.join() + self.server_thread = None + print("Panza web server stopped.") From a5227682d0d1f4f18df3fec15e468bbc3261fbda Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Thu, 5 Sep 2024 13:00:08 +0200 Subject: [PATCH 050/112] add hydra yaml overrides to training script (works for full training only atm) --- src/panza3/finetuning/train.py | 179 ++++++++++++++++++++++++++++++--- 1 file changed, 167 insertions(+), 12 deletions(-) diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index 8f84e23..857f2dc 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -4,9 +4,12 @@ import gc import logging import os +import random import sys +import tempfile import time import warnings +from pathlib import Path from typing import Any, Dict, List, Optional, Union import torch @@ -21,6 +24,7 @@ from composer.optim import DecoupledAdamW from composer.profiler import JSONTraceHandler, Profiler, TraceHandler, cyclic_schedule from composer.utils import dist, get_device, reproducibility +from datasets import disable_caching from llmfoundry import ComposerHFCausalLM from llmfoundry.eval.metrics.nlp import InContextLearningMetric from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP @@ -47,6 +51,11 @@ update_batch_size_info) from llmfoundry.utils.registry_utils import import_file +import hydra +from omegaconf import DictConfig, OmegaConf + +from panza3 import PanzaWriter # The import also loads custom Hydra resolvers + log = logging.getLogger(__name__) @@ -124,6 +133,114 @@ def validate_config(cfg: DictConfig): ) +def create_run_name(cfg: DictConfig) -> str: + # export RUN_NAME=panza_${PANZA_USERNAME}_${MODEL_TYPE}_${MODEL_PRECISION}-bs${BS}-fft-lr${LR}-epochs${NUM_EPOCHS}-wu${WARMUP}-seed${SEED}${PREAMBLE_STR}${RAFT_STR} + + run_name = f"panza_{cfg.user.username}" + + model_name = cfg.model.split("/")[-1] + run_name += f"-{model_name}" + + run_name += f"-{cfg.model_precision}" + run_name += f"-bs{cfg.batch_size}" + + if hasattr(cfg.finetuning, "rosa"): + run_name += "-rosa" + else: + run_name += "-fft" + + run_name += f"-lr{cfg.lr}" + run_name += f"-epochs{cfg.num_epochs}" + run_name += f"-seed{cfg.seed}" + + return run_name + + +def override_rosa_schedule(cfg: DictConfig, mask_generation=False) -> None: + # Disable struct mode to allow modifications + rosa_cfg = cfg.finetuning.rosa + OmegaConf.set_struct(rosa_cfg, False) + + mask_path = str(Path(cfg.checkpoint_dir) / "masks" / cfg.finetuning.run_name) + + if mask_generation: + rosa_cfg.schedule = "wl16" if 
rosa_cfg.lora_r != 0 else "spa_only" + rosa_cfg.mask_load_path = None + rosa_cfg.mask_save_path = mask_path + rosa_cfg.terminate_after_mask_generation = True + else: + if rosa_cfg.spa_d == 0 and rosa_cfg.lora_r != 0: + rosa_cfg.schedule = "default" + elif rosa_cfg.lora_r != 0: + rosa_cfg.schedule = "lora_only" + rosa_cfg.mask_load_path = None + else: + rosa_cfg.schedule = "spa_only" + + rosa_cfg.mask_load_path = mask_path + rosa_cfg.mask_save_path = None + rosa_cfg.terminate_after_mask_generation = None + + # Re-enable struct mode to lock down the configuration + OmegaConf.set_struct(rosa_cfg, True) + + +def create_experiment_yaml() -> str: + pass + + +def create_checkpoint_dirs(cfg: DictConfig) -> None: + # Create model directory + os.makedirs(os.path.join(cfg.checkpoint_dir, "models"), exist_ok=True) + + # Create mask directory + if hasattr(cfg.finetuning, "rosa"): + os.makedirs(os.path.join(cfg.checkpoint_dir, "masks"), exist_ok=True) + + +def get_hf_save_precision(cfg: DictConfig) -> str: + if cfg.model_precision == "bf16": + return "bfloat16" + elif cfg.model_precision == "fp32": + return "float32" + else: + raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") + + +def get_rosa_dtype(cfg: DictConfig) -> str: + if cfg.model_precision == "bf16": + return "bg16" + elif cfg.model_precision == "fp32": + return "fp32" + elif cfg.model_precision == "4bit": + return "fp32" + else: + raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") + + +def override_config(cfg: DictConfig) -> None: + # Disable struct mode to allow modifications + OmegaConf.set_struct(cfg, False) + + if not cfg.finetuning.run_name: + cfg.finetuning.run_name = create_run_name(cfg) + + if hasattr(cfg.finetuning, "rosa"): + cfg.finetuning.rosa.rosa_dtype = get_rosa_dtype(cfg) + else: + cfg.finetuning.callbacks.hf_checkpointer.precision = get_hf_save_precision(cfg) + + # Re-enable struct mode to lock down the configuration + OmegaConf.set_struct(cfg, True) + + +def save_config_to_yaml(cfg: DictConfig) -> str: + cfg = OmegaConf.to_container(cfg, resolve=True) + with tempfile.NamedTemporaryFile("w", delete=False, suffix=".yaml") as temp_file: + OmegaConf.save(config=cfg, f=temp_file.name) + return temp_file.name + + def build_composer_peft_model( model_config: str, rosa_config: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, is_fsdp: bool = False) -> ComposerHFCausalLM: @@ -214,7 +331,30 @@ def build_composer_peft_model( # model = ModelComposerHFCausalLM(model, tokenizer) return model +@hydra.main(version_base="1.1", config_path="../../../configs", config_name="panza_finetuning") def main(cfg: DictConfig) -> Trainer: + override_config(cfg) + + #raise ValueError(cfg) + # The preprocessing config is saved to a temporary directory + # and accessed through an environment variable. Note that this + # happens separately for each process (however, a collision should) + # not be a problem, since the configs are the same. + preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) + + #create_checkpoint_dirs(cfg) + environment = os.environ + # I don't think we need this, since panza is loaded with pip. + #environment["PYTHONPATH"] = os.path.join(cfg.panza_workspace, "src") + environment["WANDB_PROJECT"] = f"panza-{cfg.user.username}" + environment["WANDB_DISABLED"] = str(int(cfg.wandb_disabled)) + environment["PANZA_PREPROCESSING_CONFIG"] = preprocessing_yaml + + cfg = cfg.finetuning + + # Make the config editable for popping. 
+ OmegaConf.set_struct(cfg, False) + # Run user provided code if specified code_paths = pop_config(cfg, 'code_paths', @@ -538,6 +678,7 @@ def main(cfg: DictConfig) -> Trainer: log.info('Building tokenizer...') tokenizer_name = tokenizer_config['name'] tokenizer_kwargs = tokenizer_config.get('kwargs', {}) + tokenizer_kwargs["num_proc"] = 1 tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) # Scheduler @@ -621,7 +762,6 @@ def main(cfg: DictConfig) -> Trainer: # Dataloaders log.info('Building train loader...') try: - from datasets import disable_caching disable_caching() train_loader = build_dataloader( train_loader_config, @@ -822,17 +962,32 @@ def main(cfg: DictConfig) -> Trainer: return trainer + +def do_thing(cfg:DictConfig) -> List[str]: + # Override configuration + override_config(cfg) + + create_checkpoint_dirs(cfg) + + # Launch training + preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) + finetuning_yaml = save_config_to_yaml(cfg.finetuning) + print(preprocessing_yaml, finetuning_yaml) + + if __name__ == '__main__': - yaml_path, args_list = sys.argv[1], sys.argv[2:] + ARGS_LIST = sys.argv[1:] + # TODO: do we need this? # Disable resolving environment variables through omegaconf. - om.clear_resolver('oc.env') - - # Load yaml and cli arguments. - with open(yaml_path) as f: - yaml_cfg = om.load(f) - cli_cfg = om.from_cli(args_list) - cfg = om.merge(yaml_cfg, cli_cfg) - om.resolve(cfg) - assert isinstance(cfg, DictConfig) - main(cfg) + # om.clear_resolver('oc.env') + + # #log.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) + # # Load yaml and cli arguments. + # with open(yaml_path) as f: + # yaml_cfg = om.load(f) + # cli_cfg = om.from_cli(args_list) + # cfg = om.merge(yaml_cfg, cli_cfg) + # om.resolve(cfg) + # assert isinstance(cfg, DictConfig) + main() From 3a2c70b2ce6619bb8fa7e0696caa14b68217f8c0 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Thu, 5 Sep 2024 13:58:03 +0200 Subject: [PATCH 051/112] remove my user from configs and add comments to show how to enable other interfaces --- configs/panza_writer.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/panza_writer.yaml b/configs/panza_writer.yaml index 3efc698..b9fd668 100644 --- a/configs/panza_writer.yaml +++ b/configs/panza_writer.yaml @@ -1,8 +1,10 @@ defaults: - base - writer: email - - user: seanyang711 + - user: default - interfaces: - gui + # - cli + # - web checkpoint: "microsoft/Phi-3-mini-4k-instruct" \ No newline at end of file From 3801351501fe2950ace2904f3dd765062533d435 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:23:45 +0200 Subject: [PATCH 052/112] add temporary inference instructions --- TEMP_HOW_TO_RUN_INFERENCE.md | 44 ++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 TEMP_HOW_TO_RUN_INFERENCE.md diff --git a/TEMP_HOW_TO_RUN_INFERENCE.md b/TEMP_HOW_TO_RUN_INFERENCE.md new file mode 100644 index 0000000..b7b0f35 --- /dev/null +++ b/TEMP_HOW_TO_RUN_INFERENCE.md @@ -0,0 +1,44 @@ +# How to run inference in Panza3 + +There are two backend options: Ollama (no GPU) or Local (with GPU). The dependencies necessary for each backend are different. + +## Step 1: Install Dependencies for Panza + +For Ollama, simply run: +```bash +pip install -e . +``` + +For Local, run: +```bash +pip install -e . 
+``` +and +```bash +pip install -e ".[training]" +``` + +## Step 2a: Ollama Prerequisites + +If running with Ollama, then Ollama needs to be installed from the [web page](https://ollama.com/). + +Then, you will need to convert your model into a GGUF file. + +## Step 2b: Local Prerequisites + +If running locally, then the Panza model needs to be located in `data`. + +## Step 3: Set configurations + +In the `configs` folder, add a user YAML file for yourself in `/user`. + +Then, in `/writer/email.yaml`, change the `llm` field to the backend of your choice. Either `ollama` or `peft` (local) or `transformers` (local). + +If running with Ollama, edit the `name` and `gguf` fields in `/writer/llm/ollama.yaml` with a name of your choice and the path to the GGUF file. + +## Step 4: Run Panza + +To run Panza, cd into the `scripts` directory and run: +```bash +python3 runner.py interfaces= +``` \ No newline at end of file From f1bf461a3666ed076cb1a30da0598ebb57621a15 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:28:41 +0200 Subject: [PATCH 053/112] use command line config specifications in the inference instructions --- TEMP_HOW_TO_RUN_INFERENCE.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/TEMP_HOW_TO_RUN_INFERENCE.md b/TEMP_HOW_TO_RUN_INFERENCE.md index b7b0f35..6e8c3a6 100644 --- a/TEMP_HOW_TO_RUN_INFERENCE.md +++ b/TEMP_HOW_TO_RUN_INFERENCE.md @@ -32,13 +32,11 @@ In the `configs` folder, add a user YAML file for yourself in `/user`. -Then, in `/writer/email.yaml`, change the `llm` field to the backend of your choice. Either `ollama` or `peft` (local) or `transformers` (local). - If running with Ollama, edit the `name` and `gguf` fields in `/writer/llm/ollama.yaml` with a name of your choice and the path to the GGUF file. 
## Step 4: Run Panza To run Panza, cd into the `scripts` directory and run: ```bash -python3 runner.py interfaces= +python3 runner.py user= interfaces= writer/llm= ``` \ No newline at end of file From 9c821b3895dad770f9c49a95103584177d83ba1f Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:29:36 +0200 Subject: [PATCH 054/112] add example to inference instructions --- TEMP_HOW_TO_RUN_INFERENCE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/TEMP_HOW_TO_RUN_INFERENCE.md b/TEMP_HOW_TO_RUN_INFERENCE.md index 6e8c3a6..da42f9d 100644 --- a/TEMP_HOW_TO_RUN_INFERENCE.md +++ b/TEMP_HOW_TO_RUN_INFERENCE.md @@ -39,4 +39,8 @@ If running with Ollama, edit the `name` and `gguf` fields in `/writer/llm/ollama To run Panza, cd into the `scripts` directory and run: ```bash python3 runner.py user= interfaces= writer/llm= +``` +For example, to run with Ollama and the CLI interface with the user `test`, run: +```bash +python3 runner.py user=test interfaces=cli writer/llm=ollama ``` \ No newline at end of file From 46f222d568ffb9a6471bc2af68971f16515f1681 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 6 Sep 2024 12:28:15 +0200 Subject: [PATCH 055/112] update training script for RoSA --- configs/finetuning/rosa.yaml | 2 +- src/panza3/finetuning/train.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/configs/finetuning/rosa.yaml b/configs/finetuning/rosa.yaml index 5c9a541..d87a7b3 100644 --- a/configs/finetuning/rosa.yaml +++ b/configs/finetuning/rosa.yaml @@ -17,7 +17,7 @@ rosa: mask_save_path: #TODO terminate_after_mask_generation: #TODO schedule: #TODO - mask_gen_model_precision: #TODO + masks_only: true scheduler: t_warmup: 8ba diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index 857f2dc..9b30d6f 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -168,8 +168,9 @@ def override_rosa_schedule(cfg: DictConfig, mask_generation=False) -> None: rosa_cfg.mask_load_path = None rosa_cfg.mask_save_path = mask_path rosa_cfg.terminate_after_mask_generation = True + rosa_cfg.mask_gen_model_precision = 'amp_bf16' else: - if rosa_cfg.spa_d == 0 and rosa_cfg.lora_r != 0: + if rosa_cfg.spa_d > 0 and rosa_cfg.lora_r != 0: rosa_cfg.schedule = "default" elif rosa_cfg.lora_r != 0: rosa_cfg.schedule = "lora_only" @@ -209,7 +210,7 @@ def get_hf_save_precision(cfg: DictConfig) -> str: def get_rosa_dtype(cfg: DictConfig) -> str: if cfg.model_precision == "bf16": - return "bg16" + return "bf16" elif cfg.model_precision == "fp32": return "fp32" elif cfg.model_precision == "4bit": @@ -227,6 +228,8 @@ def override_config(cfg: DictConfig) -> None: if hasattr(cfg.finetuning, "rosa"): cfg.finetuning.rosa.rosa_dtype = get_rosa_dtype(cfg) + if cfg.finetuning.rosa.spa_d != 0: + override_rosa_schedule(cfg, mask_generation=cfg.finetuning.rosa.masks_only) else: cfg.finetuning.callbacks.hf_checkpointer.precision = get_hf_save_precision(cfg) @@ -301,6 +304,7 @@ def build_composer_peft_model( bias="none", task_type="CAUSAL_LM", ) + #raise ValueError(config) print('Adding RoSA modules...') model = get_peft_model(model, config) print('RoSA modules added!') @@ -335,7 +339,7 @@ def build_composer_peft_model( def main(cfg: DictConfig) -> Trainer: override_config(cfg) - #raise ValueError(cfg) + #!!! # The preprocessing config is saved to a temporary directory # and accessed through an environment variable. 
Note that this # happens separately for each process (however, a collision should) @@ -351,6 +355,7 @@ def main(cfg: DictConfig) -> Trainer: environment["PANZA_PREPROCESSING_CONFIG"] = preprocessing_yaml cfg = cfg.finetuning + #raise ValueError(cfg) # Make the config editable for popping. OmegaConf.set_struct(cfg, False) @@ -874,6 +879,10 @@ def main(cfg: DictConfig) -> Trainer: # Build the Trainer log.info('Building trainer...') + dtypes = {x.dtype for x in model.parameters()} + print(dtypes) + #raise ValueError(dtypes) + #raise ValueError(model.dtype) trainer = Trainer( run_name=run_name, seed=seed, From 62d3b27a8fbede770a330a80b4abce5598e5c828 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 6 Sep 2024 14:43:16 +0200 Subject: [PATCH 056/112] remove redundant, unnecessary and problematic use_rag and use_thread arguments from prompt building --- src/panza3/prompting/email_prompting.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/panza3/prompting/email_prompting.py b/src/panza3/prompting/email_prompting.py index 45f01db..3a0e227 100644 --- a/src/panza3/prompting/email_prompting.py +++ b/src/panza3/prompting/email_prompting.py @@ -98,17 +98,15 @@ def load_all_preambles( def build_prompt( self, instruction: EmailInstruction, - use_rag: bool = True, - use_thread: bool = True, ) -> str: - if use_rag and not self.rag_preamble: + if self.number_thread_emails and not self.rag_preamble: raise ValueError("RAG preamble format must be provided if RAG is used.") - if use_thread and not self.thread_preamble: + if self.number_thread_emails and not self.thread_preamble: raise ValueError("Thread preamble format must be provided if thread is used.") - if use_rag: + if self.number_rag_emails > 0: relevant_emails = self.retriever.retrieve( instruction.instruction, self.number_rag_emails, self.rag_relevance_threshold ) @@ -116,7 +114,7 @@ def build_prompt( else: rag_prompt = "" - if use_thread: + if self.number_thread_emails > 0: thread_prompt = self._create_threading_preamble( instruction.thread[: self.number_thread_emails] ).strip() From bd42248d0a4b6d5082ac9891ed861bb43f1c5b11 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 6 Sep 2024 15:12:44 +0200 Subject: [PATCH 057/112] minor training cleanups --- src/panza3/finetuning/train.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index 9b30d6f..415f52b 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -1,11 +1,10 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 + import copy import gc import logging import os -import random -import sys import tempfile import time import warnings @@ -339,7 +338,6 @@ def build_composer_peft_model( def main(cfg: DictConfig) -> Trainer: override_config(cfg) - #!!! # The preprocessing config is saved to a temporary directory # and accessed through an environment variable. Note that this # happens separately for each process (however, a collision should) @@ -985,18 +983,4 @@ def do_thing(cfg:DictConfig) -> List[str]: if __name__ == '__main__': - ARGS_LIST = sys.argv[1:] - - # TODO: do we need this? - # Disable resolving environment variables through omegaconf. - # om.clear_resolver('oc.env') - - # #log.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) - # # Load yaml and cli arguments. 
- # with open(yaml_path) as f: - # yaml_cfg = om.load(f) - # cli_cfg = om.from_cli(args_list) - # cfg = om.merge(yaml_cfg, cli_cfg) - # om.resolve(cfg) - # assert isinstance(cfg, DictConfig) main() From 7e8f3430723bafce0ef717755d18055295c2a111 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Tue, 10 Sep 2024 10:35:10 +0200 Subject: [PATCH 058/112] add config for peft writer --- configs/writer/llm/peft.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/configs/writer/llm/peft.yaml b/configs/writer/llm/peft.yaml index 6933876..8a97b5e 100644 --- a/configs/writer/llm/peft.yaml +++ b/configs/writer/llm/peft.yaml @@ -1 +1,9 @@ -# TODO: Add PEFT config \ No newline at end of file +defaults: + - sampling: random + +_target_: panza3.llm.PeftLLM +name: ${checkpoint} +checkpoint: ${checkpoint} +device: "cpu" # Alternatively, "cuda" +dtype: "fp32" +load_in_4bit: false From 090c8f18e3d40710397d526a56390c4f178c4038 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Wed, 11 Sep 2024 11:06:23 +0200 Subject: [PATCH 059/112] deprecate panza_finetuning.yaml --- configs/base.yaml | 10 ++++++++++ configs/finetuning/base.yaml | 6 ++---- configs/finetuning/full.yaml | 8 ++++++++ configs/finetuning/rosa.yaml | 9 ++++++++- configs/panza_finetuning.yaml | 18 ------------------ src/panza3/finetuning/train.py | 30 ++++++++++-------------------- 6 files changed, 38 insertions(+), 43 deletions(-) delete mode 100644 configs/panza_finetuning.yaml diff --git a/configs/base.yaml b/configs/base.yaml index 0fb1a10..e73a643 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -1,7 +1,17 @@ +defaults: + - user: default + - finetuning: full + - writer/prompting/email_prompting@preprocessing.prompting + panza_workspace: ${hydra:runtime.cwd}/../ checkpoint_dir: ${panza_workspace}/checkpoints seed: 42 +model: "ISTA-DASLab/Meta-Llama-3-8B-Instruct" embedding_model: "sentence-transformers/all-mpnet-base-v2" +preprocessing: + model: ${model} + + wandb_disabled: false \ No newline at end of file diff --git a/configs/finetuning/base.yaml b/configs/finetuning/base.yaml index d62ad00..bb548ff 100644 --- a/configs/finetuning/base.yaml +++ b/configs/finetuning/base.yaml @@ -6,11 +6,9 @@ load_path: # set via bash script to be absolute path to your sparse checkpoint precision: amp_bf16 hf_save_path: ${checkpoint_dir}/models -max_duration: ${num_epochs}ep eval_interval: 1 -seed: ${seed} -global_train_batch_size: ${batch_size} +global_train_batch_size: 8 device_train_microbatch_size: 1 device_eval_batch_size: 1 @@ -56,7 +54,7 @@ scheduler: optimizer: name: decoupled_adamw - lr: ${lr} + lr: 1e-5 betas: - 0.9 - 0.999 diff --git a/configs/finetuning/full.yaml b/configs/finetuning/full.yaml index 27e90bb..493bde5 100644 --- a/configs/finetuning/full.yaml +++ b/configs/finetuning/full.yaml @@ -1,6 +1,14 @@ defaults: - base + +max_duration: 3ep +lr: 1e-5 +batch_size: 8 +eval_interval: 1 +model_precision: bf16 # bf16 or fp32 +seed: 42 + fsdp_config: sharding_strategy: FULL_SHARD mixed_precision: FULL diff --git a/configs/finetuning/rosa.yaml b/configs/finetuning/rosa.yaml index d87a7b3..40f53d6 100644 --- a/configs/finetuning/rosa.yaml +++ b/configs/finetuning/rosa.yaml @@ -1,8 +1,15 @@ defaults: - base +max_duration: 5ep +lr: 1e-5 +batch_size: 8 +eval_interval: 1 +model_precision: bf16 # bf16 or fp32 +seed: 42 + rosa: - lora_lr: ${lr} + lora_lr: ${finetuning.lr} lora_r: 8 spa_d: 0.01 lora_alpha: 16 diff --git a/configs/panza_finetuning.yaml b/configs/panza_finetuning.yaml deleted file mode 100644 index 
0e05413..0000000 --- a/configs/panza_finetuning.yaml +++ /dev/null @@ -1,18 +0,0 @@ -defaults: - - base - - user: default - - finetuning: full - - writer/prompting/email_prompting@preprocessing.prompting - - -model: "ISTA-DASLab/Meta-Llama-3-8B-Instruct" -num_epochs: 1 -lr: 1e-4 -batch_size: 8 -model_precision: bf16 # bf16 or fp32 - -preprocessing: - model: ${model} - prompting: - number_rag_emails: 0 - number_thread_emails: 0 \ No newline at end of file diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index 415f52b..bb735ba 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -140,17 +140,17 @@ def create_run_name(cfg: DictConfig) -> str: model_name = cfg.model.split("/")[-1] run_name += f"-{model_name}" - run_name += f"-{cfg.model_precision}" - run_name += f"-bs{cfg.batch_size}" + run_name += f"-{cfg.finetuning.model_precision}" + run_name += f"-bs{cfg.finetuning.batch_size}" if hasattr(cfg.finetuning, "rosa"): run_name += "-rosa" else: run_name += "-fft" - run_name += f"-lr{cfg.lr}" - run_name += f"-epochs{cfg.num_epochs}" - run_name += f"-seed{cfg.seed}" + run_name += f"-lr{cfg.finetuning.lr}" + run_name += f"-{cfg.finetuning.max_duration}" + run_name += f"-seed{cfg.finetuning.seed}" return run_name @@ -334,10 +334,13 @@ def build_composer_peft_model( # model = ModelComposerHFCausalLM(model, tokenizer) return model -@hydra.main(version_base="1.1", config_path="../../../configs", config_name="panza_finetuning") +@hydra.main(version_base="1.1", config_path="../../../configs", config_name="base") def main(cfg: DictConfig) -> Trainer: override_config(cfg) + # Resolve all interpolation variables as early as possible + om.resolve(cfg) + # The preprocessing config is saved to a temporary directory # and accessed through an environment variable. 
Note that this # happens separately for each process (however, a collision should) @@ -881,6 +884,7 @@ def main(cfg: DictConfig) -> Trainer: print(dtypes) #raise ValueError(dtypes) #raise ValueError(model.dtype) + #raise ValueError([save_folder, save_overwrite, save_filename, save_latest_filename, save_interval, save_overwrite]) trainer = Trainer( run_name=run_name, seed=seed, @@ -968,19 +972,5 @@ def main(cfg: DictConfig) -> Trainer: log.info('Done.') return trainer - - -def do_thing(cfg:DictConfig) -> List[str]: - # Override configuration - override_config(cfg) - - create_checkpoint_dirs(cfg) - - # Launch training - preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) - finetuning_yaml = save_config_to_yaml(cfg.finetuning) - print(preprocessing_yaml, finetuning_yaml) - - if __name__ == '__main__': main() From aa6a58696009904070d280ae44bf4bc94aaf8e7e Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Wed, 11 Sep 2024 13:19:25 +0200 Subject: [PATCH 060/112] small config fixes --- configs/base.yaml | 1 + configs/finetuning/full.yaml | 1 - configs/finetuning/rosa.yaml | 1 - configs/panza_writer.yaml | 3 +-- src/panza3/finetuning/train.py | 3 +-- 5 files changed, 3 insertions(+), 6 deletions(-) diff --git a/configs/base.yaml b/configs/base.yaml index e73a643..8a6012c 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -9,6 +9,7 @@ seed: 42 model: "ISTA-DASLab/Meta-Llama-3-8B-Instruct" embedding_model: "sentence-transformers/all-mpnet-base-v2" +model_precision: bf16 # bf16 or fp32 preprocessing: model: ${model} diff --git a/configs/finetuning/full.yaml b/configs/finetuning/full.yaml index 493bde5..92e939c 100644 --- a/configs/finetuning/full.yaml +++ b/configs/finetuning/full.yaml @@ -6,7 +6,6 @@ max_duration: 3ep lr: 1e-5 batch_size: 8 eval_interval: 1 -model_precision: bf16 # bf16 or fp32 seed: 42 fsdp_config: diff --git a/configs/finetuning/rosa.yaml b/configs/finetuning/rosa.yaml index 40f53d6..ac27984 100644 --- a/configs/finetuning/rosa.yaml +++ b/configs/finetuning/rosa.yaml @@ -5,7 +5,6 @@ max_duration: 5ep lr: 1e-5 batch_size: 8 eval_interval: 1 -model_precision: bf16 # bf16 or fp32 seed: 42 rosa: diff --git a/configs/panza_writer.yaml b/configs/panza_writer.yaml index b9fd668..62434f5 100644 --- a/configs/panza_writer.yaml +++ b/configs/panza_writer.yaml @@ -1,10 +1,9 @@ defaults: - base - writer: email - - user: default - interfaces: - gui # - cli # - web -checkpoint: "microsoft/Phi-3-mini-4k-instruct" \ No newline at end of file +checkpoint: '/nfs/scistore19/alistgrp/eiofinov/PanzaMail/data/jen_model' \ No newline at end of file diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index bb735ba..8c0630d 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -140,7 +140,7 @@ def create_run_name(cfg: DictConfig) -> str: model_name = cfg.model.split("/")[-1] run_name += f"-{model_name}" - run_name += f"-{cfg.finetuning.model_precision}" + run_name += f"-{cfg.model_precision}" run_name += f"-bs{cfg.finetuning.batch_size}" if hasattr(cfg.finetuning, "rosa"): @@ -356,7 +356,6 @@ def main(cfg: DictConfig) -> Trainer: environment["PANZA_PREPROCESSING_CONFIG"] = preprocessing_yaml cfg = cfg.finetuning - #raise ValueError(cfg) # Make the config editable for popping. 
OmegaConf.set_struct(cfg, False) From 19e5b54844783c7f3ec5c4e165c45259f8b43344 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Wed, 11 Sep 2024 13:49:12 +0200 Subject: [PATCH 061/112] Refactor data summarization --- configs/panza_preparation.yaml | 10 ++ .../prompting/summarization_prompting.yaml | 3 + configs/writer/summary.yaml | 5 + .../summarization_prompt.txt | 0 scripts/prepare_data.py | 119 ++++++++++++++++++ src/panza3/entities/__init__.py | 4 +- src/panza3/entities/instruction.py | 5 + src/panza3/prompting/__init__.py | 3 +- .../prompting/summarization_prompting.py | 19 +++ src/panza3/retriever/faiss.py | 1 + 10 files changed, 166 insertions(+), 3 deletions(-) create mode 100644 configs/panza_preparation.yaml create mode 100644 configs/writer/prompting/summarization_prompting.yaml create mode 100644 configs/writer/summary.yaml rename {src/panza/data_preparation => prompt_preambles}/summarization_prompt.txt (100%) create mode 100644 scripts/prepare_data.py create mode 100644 src/panza3/prompting/summarization_prompting.py diff --git a/configs/panza_preparation.yaml b/configs/panza_preparation.yaml new file mode 100644 index 0000000..54b001b --- /dev/null +++ b/configs/panza_preparation.yaml @@ -0,0 +1,10 @@ +defaults: + - base + - user: armand + - writer: summary + - writer/prompting/retriever/faiss@retriever + +batch_size: 8 +data_path: ${user.data_dir}/${user.username}_clean.jsonl +checkpoint: "microsoft/Phi-3-mini-4k-instruct" +force: false \ No newline at end of file diff --git a/configs/writer/prompting/summarization_prompting.yaml b/configs/writer/prompting/summarization_prompting.yaml new file mode 100644 index 0000000..6e44871 --- /dev/null +++ b/configs/writer/prompting/summarization_prompting.yaml @@ -0,0 +1,3 @@ +_target_: panza3.prompting.SummarizationPromptBuilder + +summarization_prompt: ${load_preamble:${panza_workspace}/prompt_preambles/summarization_prompt.txt} diff --git a/configs/writer/summary.yaml b/configs/writer/summary.yaml new file mode 100644 index 0000000..76d8f83 --- /dev/null +++ b/configs/writer/summary.yaml @@ -0,0 +1,5 @@ +defaults: + - llm: transformers + - prompting: summarization_prompting + +_target_: panza3.writer.PanzaWriter diff --git a/src/panza/data_preparation/summarization_prompt.txt b/prompt_preambles/summarization_prompt.txt similarity index 100% rename from src/panza/data_preparation/summarization_prompt.txt rename to prompt_preambles/summarization_prompt.txt diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py new file mode 100644 index 0000000..8674660 --- /dev/null +++ b/scripts/prepare_data.py @@ -0,0 +1,119 @@ +import json +import logging +import os +import sys +import time +from typing import List + +import hydra +from omegaconf import DictConfig, OmegaConf +from tqdm import tqdm + +from panza3 import PanzaWriter # The import also loads custom Hydra resolvers +from panza3.entities import Document, Email, SummarizationInstruction +from panza3.retriever import DocumentRetriever + +LOGGER = logging.getLogger(__name__) + + +def rename_config_keys(cfg: DictConfig) -> None: + # Disable struct mode to allow modifications + OmegaConf.set_struct(cfg, False) + + cfg.writer.llm.sampling_parameters = cfg.writer.llm.sampling + del cfg.writer.llm.sampling + + cfg.writer.prompt_builder = cfg.writer.prompting + del cfg.writer.prompting + + # Re-enable struct mode to lock down the configuration + OmegaConf.set_struct(cfg, True) + + +def load_documents(data_path: str) -> None: + assert data_path.endswith(".jsonl"), f"Expecting a 
.jsonl file, but given = {data_path}" + + LOGGER.info(f"--> Reading emails from: {data_path}") + + with open(data_path, "r") as f: + lines = f.readlines() + documents = [Email.deserialize(line.strip(",")) for line in lines] + print(f"--> # emails = {len(documents)}") + + return documents + + +def generate_synthetic_instructions( + documents: List[Document], writer: PanzaWriter, batch_size: int, output_path: str +) -> None: + num_processed_documents = 0 + num_batches = (len(documents) - 1) // batch_size + 1 + start_time = time.time() + with open(output_path, "w") as f: + for i in tqdm(range(0, len(documents), batch_size)): + print(f"--> Processing batch {i // batch_size + 1}/{num_batches}") + batch = documents[i : i + batch_size] + # TODO: Rename .email to .content + instructions = [ + SummarizationInstruction(instruction=document.email) for document in batch + ] + + summaries = writer.run_batch(instructions) + num_processed_documents += len(summaries) + + for it, summary in enumerate(summaries): + # TODO: Add cleaning and filtering + batch[it].summary = summary + + # Write the summarized documents to a file + for document in batch: + f.write(json.dumps(document.serialize())) + f.write("\n") + + elapsed_time = time.time() - start_time + LOGGER.info(f"--> Processed {num_processed_documents} documents in {elapsed_time:.2f} seconds.") + + +def check_if_file_exists(cfg: DictConfig) -> None: + output_path = cfg.data_path.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" + if os.path.exists(output_path) and not cfg.force: + LOGGER.warning( + "Summaries already exist; the program will close. " + "If you want to regenerate them, use the flag force=true." + ) + sys.exit(0) + + +@hydra.main(version_base="1.1", config_path="../configs", config_name="panza_preparation") +def main(cfg: DictConfig) -> None: + LOGGER.info("Running Panza Data Preparation") + LOGGER.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) + + # Skip running if summaries already exist + check_if_file_exists(cfg) + + # Rename config keys to follow class structure + rename_config_keys(cfg) + + # Instantiate Panza writer + writer: PanzaWriter = hydra.utils.instantiate(cfg.writer) + assert isinstance(writer, PanzaWriter), "Failed to instantiate PanzaWriter" + + # Instantiate retriever + retriever: DocumentRetriever = hydra.utils.instantiate(cfg.retriever) + assert isinstance(retriever, DocumentRetriever), "Failed to instantiate DocumentRetriever" + retriever.set_document_class(Email) + + # Load documents + documents = load_documents(cfg.data_path) + # TODO: Add custom resolver for output path and add it in config + output_path = cfg.data_path.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" + generate_synthetic_instructions( + documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=output_path + ) + + # TODO: Create vector store + + +if __name__ == "__main__": + main() diff --git a/src/panza3/entities/__init__.py b/src/panza3/entities/__init__.py index f306513..99266aa 100644 --- a/src/panza3/entities/__init__.py +++ b/src/panza3/entities/__init__.py @@ -1,4 +1,4 @@ from .document import Document, Email -from .instruction import EmailInstruction, Instruction +from .instruction import EmailInstruction, Instruction, SummarizationInstruction -__all__ = ["Document", "Email", "EmailInstruction", "Instruction"] +__all__ = ["Document", "Email", "EmailInstruction", "Instruction", "SummarizationInstruction"] diff --git a/src/panza3/entities/instruction.py b/src/panza3/entities/instruction.py index 544f5f5..f622329 100644 --- 
a/src/panza3/entities/instruction.py +++ b/src/panza3/entities/instruction.py @@ -14,3 +14,8 @@ class Instruction(ABC): @dataclass(kw_only=True) class EmailInstruction(Instruction): thread: List[str] = field(default_factory=list) + + +@dataclass(kw_only=True) +class SummarizationInstruction(Instruction): + pass diff --git a/src/panza3/prompting/__init__.py b/src/panza3/prompting/__init__.py index 4257c7c..ab79ffd 100644 --- a/src/panza3/prompting/__init__.py +++ b/src/panza3/prompting/__init__.py @@ -1,4 +1,5 @@ from .base import PromptBuilder from .email_prompting import EmailPromptBuilder +from .summarization_prompting import SummarizationPromptBuilder -__all__ = ["PromptBuilder", "EmailPromptBuilder"] +__all__ = ["PromptBuilder", "EmailPromptBuilder", "SummarizationPromptBuilder"] diff --git a/src/panza3/prompting/summarization_prompting.py b/src/panza3/prompting/summarization_prompting.py new file mode 100644 index 0000000..e1e28fb --- /dev/null +++ b/src/panza3/prompting/summarization_prompting.py @@ -0,0 +1,19 @@ +from ..entities import SummarizationInstruction +from .base import PromptBuilder + + +class SummarizationPromptBuilder(PromptBuilder): + def __init__( + self, + summarization_prompt: str, + ): + self.summarization_prompt = summarization_prompt + + def build_prompt( + self, + instruction: SummarizationInstruction, + ) -> str: + + prompt = self.summarization_prompt.format(email=instruction.instruction).strip() + + return prompt diff --git a/src/panza3/retriever/faiss.py b/src/panza3/retriever/faiss.py index fcdfa9a..898851c 100644 --- a/src/panza3/retriever/faiss.py +++ b/src/panza3/retriever/faiss.py @@ -88,6 +88,7 @@ def store(self, documents: List[Document], chunk_size: int, chunk_overlap: int): if self.db: self.db.merge_from(db) else: + LOGGER.info(f"Creating new Faiss index {self.index_name} in {self.db_path}.") self.db = db def save_db_to_disk(self): From 4fb5c8dff5aa8bc27d054e7a9bbc5dd74074bfe5 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 11 Sep 2024 15:58:26 +0200 Subject: [PATCH 062/112] add sampling parameters to ollama LLM --- src/panza3/llm/ollama.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/panza3/llm/ollama.py b/src/panza3/llm/ollama.py index b7f803c..e6f7223 100644 --- a/src/panza3/llm/ollama.py +++ b/src/panza3/llm/ollama.py @@ -18,7 +18,7 @@ def __init__(self, name: str, gguf_file: str, sampling_parameters: Dict): """ super().__init__(name, sampling_parameters) self.gguf_file = gguf_file - self.sampling_params = sampling_parameters + self.sampling_parameters = sampling_parameters if not self._is_ollama_running(): self._start_ollama() @@ -48,13 +48,24 @@ def _is_model_loaded(self) -> bool: return True return False + def _make_modelfile_parameters(self) -> str: + if self.sampling_parameters is None or self.sampling_parameters["do_sample"] == False: + return "" + return f""" + PARAMETER temperature {self.sampling_parameters["temperature"]} + PARAMETER top_k {self.sampling_parameters["top_k"]} + PARAMETER top_p {self.sampling_parameters["top_p"]} + PARAMETER num_predict {self.sampling_parameters["max_new_tokens"]} + """ + def _load_model(self) -> None: - # TODO: Add sampling parameters to the model file modelfile = f""" FROM {self.gguf_file} + {self._make_modelfile_parameters()} """ try: ollama.create(model={self.name}, modelfile=modelfile, stream=True) + print("Loaded a new mode into Ollama.") except: raise Exception(f"Failed to load model {self.name} with GGUF 
file {self.gguf_file}.") From 21a3fcf50aff9b4f8085bb15ddbe0c7fbec4a5d4 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Thu, 12 Sep 2024 09:21:35 +0200 Subject: [PATCH 063/112] refactor configs to get unnecessary params out of configs/base --- configs/base.yaml | 11 +---------- configs/finetuning/base.yaml | 3 ++- configs/finetuning/full.yaml | 3 ++- configs/finetuning/rosa.yaml | 3 ++- configs/panza_finetuning.yaml | 7 +++++++ src/panza3/finetuning/train.py | 8 +++++--- 6 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 configs/panza_finetuning.yaml diff --git a/configs/base.yaml b/configs/base.yaml index 8a6012c..0c5d85e 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -1,18 +1,9 @@ defaults: - user: default - - finetuning: full - - writer/prompting/email_prompting@preprocessing.prompting panza_workspace: ${hydra:runtime.cwd}/../ checkpoint_dir: ${panza_workspace}/checkpoints seed: 42 -model: "ISTA-DASLab/Meta-Llama-3-8B-Instruct" embedding_model: "sentence-transformers/all-mpnet-base-v2" -model_precision: bf16 # bf16 or fp32 - -preprocessing: - model: ${model} - - -wandb_disabled: false \ No newline at end of file +model_precision: bf16 # bf16 or fp32 \ No newline at end of file diff --git a/configs/finetuning/base.yaml b/configs/finetuning/base.yaml index bb548ff..ee5b3d8 100644 --- a/configs/finetuning/base.yaml +++ b/configs/finetuning/base.yaml @@ -1,6 +1,7 @@ +wandb_disabled: false + max_seq_len: 512 global_seed: ${seed} -model_name_or_path: ${model} load_path: # set via bash script to be absolute path to your sparse checkpoint precision: amp_bf16 diff --git a/configs/finetuning/full.yaml b/configs/finetuning/full.yaml index 92e939c..bc4c9be 100644 --- a/configs/finetuning/full.yaml +++ b/configs/finetuning/full.yaml @@ -6,7 +6,8 @@ max_duration: 3ep lr: 1e-5 batch_size: 8 eval_interval: 1 -seed: 42 +seed: ${seed} +model_name_or_path: "ISTA-DASLab/Meta-Llama-3-8B-Instruct" fsdp_config: sharding_strategy: FULL_SHARD diff --git a/configs/finetuning/rosa.yaml b/configs/finetuning/rosa.yaml index ac27984..6abaa4f 100644 --- a/configs/finetuning/rosa.yaml +++ b/configs/finetuning/rosa.yaml @@ -5,7 +5,8 @@ max_duration: 5ep lr: 1e-5 batch_size: 8 eval_interval: 1 -seed: 42 +seed: ${seed} +model_name_or_path: "ISTA-DASLab/Meta-Llama-3-8B-Instruct" rosa: lora_lr: ${finetuning.lr} diff --git a/configs/panza_finetuning.yaml b/configs/panza_finetuning.yaml new file mode 100644 index 0000000..3cab7a6 --- /dev/null +++ b/configs/panza_finetuning.yaml @@ -0,0 +1,7 @@ + +defaults: + - base + - finetuning: full + # For preprocessing (i.e., assembling the LLM prompt, inherit the defaults + # from the writer (the inference module).) 
+ - writer/prompting/email_prompting@preprocessing.prompting \ No newline at end of file diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index 8c0630d..ee1133c 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -137,7 +137,7 @@ def create_run_name(cfg: DictConfig) -> str: run_name = f"panza_{cfg.user.username}" - model_name = cfg.model.split("/")[-1] + model_name = cfg.finetuning.model_name_or_path.split("/")[-1] run_name += f"-{model_name}" run_name += f"-{cfg.model_precision}" @@ -334,7 +334,7 @@ def build_composer_peft_model( # model = ModelComposerHFCausalLM(model, tokenizer) return model -@hydra.main(version_base="1.1", config_path="../../../configs", config_name="base") +@hydra.main(version_base="1.1", config_path="../../../configs", config_name="panza_finetuning") def main(cfg: DictConfig) -> Trainer: override_config(cfg) @@ -345,6 +345,8 @@ def main(cfg: DictConfig) -> Trainer: # and accessed through an environment variable. Note that this # happens separately for each process (however, a collision should) # not be a problem, since the configs are the same. + OmegaConf.set_struct(cfg, False) + cfg.preprocessing.model = cfg.finetuning.model_name_or_path preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) #create_checkpoint_dirs(cfg) @@ -352,7 +354,7 @@ def main(cfg: DictConfig) -> Trainer: # I don't think we need this, since panza is loaded with pip. #environment["PYTHONPATH"] = os.path.join(cfg.panza_workspace, "src") environment["WANDB_PROJECT"] = f"panza-{cfg.user.username}" - environment["WANDB_DISABLED"] = str(int(cfg.wandb_disabled)) + environment["WANDB_DISABLED"] = str(int(cfg.finetuning.wandb_disabled)) environment["PANZA_PREPROCESSING_CONFIG"] = preprocessing_yaml cfg = cfg.finetuning From 9a1c9c2e7f8d3f1903eca55866ea74e299252e8f Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Thu, 12 Sep 2024 09:28:24 +0200 Subject: [PATCH 064/112] allow code execution during model loading to allow phi3.5 --- src/panza3/finetuning/preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/panza3/finetuning/preprocessing.py b/src/panza3/finetuning/preprocessing.py index 346bae2..874584c 100644 --- a/src/panza3/finetuning/preprocessing.py +++ b/src/panza3/finetuning/preprocessing.py @@ -14,8 +14,8 @@ preprocessing_config = OmegaConf.load(PREPROCESSING_CONFIG_FILE) prompt_builder = hydra.utils.instantiate(preprocessing_config.prompting) - # Load tokenizer - config = AutoConfig.from_pretrained(preprocessing_config.model) + # Load tokenizer. The trust_remote_code parameter is necessary to load Phi-3.5. 
+ config = AutoConfig.from_pretrained(preprocessing_config.model, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained( preprocessing_config.model, model_max_length=config.max_position_embeddings ) @@ -27,7 +27,7 @@ def panza_preprocessing_function(inputs: Dict) -> Dict: instruction = EmailInstruction(instruction=prompt_raw, thread=inputs.get("thread", [])) prompt = prompt_builder.build_prompt(instruction) - print(f"Prompt: {prompt}") + #print(f"Prompt: {prompt}") # Generate the full conversation conversation = [ From 3c44172c4d5b88a54ecf19a23d0b7e1388783046 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Thu, 12 Sep 2024 09:33:36 +0200 Subject: [PATCH 065/112] greatly simplify the .sh training script to take advantage of the configs --- scripts/train_fft.sh | 160 +-------------------------- scripts/train_rosa.sh | 246 ++---------------------------------------- 2 files changed, 8 insertions(+), 398 deletions(-) diff --git a/scripts/train_fft.sh b/scripts/train_fft.sh index 6938d7c..d79c5b0 100755 --- a/scripts/train_fft.sh +++ b/scripts/train_fft.sh @@ -1,162 +1,6 @@ set -e -source config.sh - current_user=$(whoami) -export DATA_PATH=${PANZA_DATA_DIR}/train.jsonl - -# hyper-parameters with default values -#export MODEL_PRECISION=bf16 # bf16 or fp32 -export BASE_SAVE_PATH=${PANZA_CHECKPOINTS} # where to store the model -export NUM_EPOCHS=3 -export WARMUP=20 # the learning rate warmup (batches) -export BS=8 -export PER_DEVICE_BS=1 -export SEED=${PANZA_SEED} - -if [[ ${MODEL_TYPE} == llama3 ]]; then - export LR=1e-5 # learning rate -elif [[ ${MODEL_TYPE} == mistralv2 ]]; then - export LR=1e-5 # learning rate -elif [[ ${MODEL_TYPE} == phi3 ]]; then - export LR=1e-5 # learning rate -else - echo "Model type ${MODEL_TYPE} not recognized! Panza only works with mistralv2, llama3 and phi3 models. Exiting." 
- exit -fi - -export PRETRAINED=${PANZA_GENERATIVE_MODEL} -export CONFIG=${PANZA_FINETUNE_CONFIGS}/fft_panza.yaml - -# take all the input arguments and put them in environment variables -# this could override the hyper-parameters defined above -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - -echo "Using Learning Rate ${LR} for ${MODEL_TYPE} model" - -export WANDB_PROJECT="panza-${PANZA_USERNAME}" - -if [ "$PANZA_FINETUNE_WITH_PREAMBLE" = 1 ]; then - PREAMBLE_STR="-PREAMBLE" - PREPROCESSING_FN=panza.finetuning.preprocessing:panza_preprocessing_function_train_with_preamble -elif [ "$PANZA_FINETUNE_WITH_THREAD" = 1 ]; then - PREAMBLE_STR="-THREAD" - PREPROCESSING_FN=panza.finetuning.preprocessing:panza_preprocessing_function_train_with_thread -else - PREAMBLE_STR="" - PREPROCESSING_FN=panza.finetuning.preprocessing:panza_preprocessing_function -fi - -if [ "$PANZA_FINETUNE_WITH_RAG" = 1 ]; then - RAFT_STR=-RAFT_num${PANZA_FINETUNE_RAG_NUM_EMAILS}_prob${PANZA_FINETUNE_RAG_PROB}_th${PANZA_FINETUNE_RAG_RELEVANCE_THRESHOLD} -else - RAFT_STR="" -fi - -# some post-processing on the inputs -export MAX_DURATION=${NUM_EPOCHS}ep -export RUN_NAME=panza_${PANZA_USERNAME}_${MODEL_TYPE}_${MODEL_PRECISION}-bs${BS}-fft-lr${LR}-epochs${NUM_EPOCHS}-wu${WARMUP}-seed${SEED}${PREAMBLE_STR}${RAFT_STR}-$RANDOM - -# create directories to save the models -mkdir -p ${BASE_SAVE_PATH}/models/ - -TEMP_FILE=$(mktemp) - -if [ "$MODEL_PRECISION" = "bf16" ]; then - export HF_SAVE_PRECISION=bfloat16 -elif [ "$MODEL_PRECISION" = "fp32" ]; then - export HF_SAVE_PRECISION=float32 -else - echo "Unknown model precision $MODEL_PRECISION" - exit 1 -fi - -export WANDB_DISABLED=${PANZA_WANDB_DISABLED} -TRAIN_SCRIPT=${PANZA_WORKSPACE}/src/panza/finetuning/train.py -composer ${TRAIN_SCRIPT} \ - ${CONFIG} \ - model_name_or_path=${PRETRAINED} \ - train_loader.dataset.hf_kwargs.data_files=${DATA_PATH} \ - train_loader.dataset.preprocessing_fn=${PREPROCESSING_FN} \ - max_duration=${MAX_DURATION} \ - run_name=${RUN_NAME} \ - optimizer.lr=${LR} \ - global_train_batch_size=${BS} \ - device_train_microbatch_size=${PER_DEVICE_BS} \ - device_eval_batch_size=${PER_DEVICE_BS} \ - scheduler.t_warmup=${WARMUP}ba \ - model.weight_bias_dtype=${MODEL_PRECISION} \ - global_seed=${SEED} \ - seed=${SEED} \ - callbacks.hf_checkpointer.precision=${HF_SAVE_PRECISION} \ - hf_save_path=${BASE_SAVE_PATH}/models/ 2>&1 | tee "$TEMP_FILE" - -# Extract the wandb run ID from the temp file -WANDB_RUN_ID=$(grep -o 'https://wandb.ai/[^ ]*/runs/[^ ]*' "$TEMP_FILE" | awk -F'/' '{print $NF}' | tail -n 1) - -rm "$TEMP_FILE" - -# move the checkpoint (saved by llm-foundry) to the correct directory -export RUN_SAVE_PATH=${BASE_SAVE_PATH}/models/${RUN_NAME} -export LAST_SAVE_DIR_NAME=$(ls -t ${RUN_SAVE_PATH}/huggingface | head -n 1) -mv ${RUN_SAVE_PATH}/huggingface/${LAST_SAVE_DIR_NAME}/* ${RUN_SAVE_PATH} -rm -rf ${RUN_SAVE_PATH}/huggingface - -echo "find the finetuned model at ${BASE_SAVE_PATH}/models/${RUN_NAME}" - -if [ -z "$WANDB_RUN_ID" ]; then - echo "No wandb run ID found." 
-else - echo "Extracted wandb run ID: $WANDB_RUN_ID" -fi - -# Running BLEU evaluation -EVAL_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/evaluation.py -python ${EVAL_SCRIPT} \ - --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --thread-preamble=${PANZA_THREAD_PREAMBLE_PATH} \ - --golden=${PANZA_DATA_DIR}/test.jsonl \ - --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ - --wandb-run-id=${WANDB_RUN_ID} - -# Running BLEU evaluation with thread -EVAL_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/evaluation.py -python ${EVAL_SCRIPT} \ - --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --thread-preamble=${PANZA_THREAD_PREAMBLE_PATH} \ - --golden=${PANZA_DATA_DIR}/test.jsonl \ - --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ - --wandb-run-id=${WANDB_RUN_ID} \ - --use-thread - -# Running BLEU evaluation with RAG -python ${EVAL_SCRIPT} \ - --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --thread-preamble=${PANZA_THREAD_PREAMBLE_PATH} \ - --golden=${PANZA_DATA_DIR}/test.jsonl \ - --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ - --wandb-run-id=${WANDB_RUN_ID} \ - --embedding-model=${PANZA_EMBEDDING_MODEL} \ - --db-path=${PANZA_DATA_DIR} \ - --index-name=${PANZA_USERNAME} \ - --use-rag - -echo "find the finetuned model at ${BASE_SAVE_PATH}/models/${RUN_NAME}" +composer ../src/panza3/finetuning/train.py \ + user=${current_user} finetuning=full \ No newline at end of file diff --git a/scripts/train_rosa.sh b/scripts/train_rosa.sh index d39d41c..5f81bd8 100755 --- a/scripts/train_rosa.sh +++ b/scripts/train_rosa.sh @@ -1,245 +1,11 @@ set -e -source config.sh - current_user=$(whoami) -export DATA_PATH=${PANZA_DATA_DIR}/train.jsonl - -# hyper-parameters with default values -export MASK_GEN_MODEL_PRECISION=${MODEL_PRECISION} # bf16, fp32, or 4bit -export BASE_SAVE_PATH=${PANZA_CHECKPOINTS} # where to store the checkpoints and generated masks -export NUM_EPOCHS=5 -export WARMUP=8 # the learning rate warmup (batches) -export BS=8 -export PER_DEVICE_BS=1 -export LORA_ALPHA=16 -export SCHEDULE=wl16 # the RoSA schedule -export SPA_NUM_GRADS=1 # number of gradients used for mask generation -export SPA_GRAD_ACC_MODE=mean_squared # 'mean' or 'mean_squared': how to accumulate gradients -export SEED=${PANZA_SEED} - -if [[ ${MODEL_TYPE} == llama3 ]]; then - export LR=1e-5 # learning rate - export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters -elif [[ ${MODEL_TYPE} == mistralv2 ]]; then - export LR=1e-5 # learning rate - export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters -elif [[ ${MODEL_TYPE} == phi3 ]]; then - export LR=1e-5 # learning rate - export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters -else - echo "Model type ${MODEL_TYPE} not recognized! Panza only works with mistralv2, llama3 and phi3 models. Exiting." 
- exit -fi - -# hyper-parameters without default values -export SPA_DENSITY=0.01 # the sparse adapters' density -export LORA_R=8 # the low-rank adapters' rank - -export PRETRAINED=${PANZA_GENERATIVE_MODEL} -export CONFIG=${PANZA_FINETUNE_CONFIGS}/rosa_panza.yaml -export NUM_CPU_THREADS=0 # useful for running of CPU, 0 means default the used by torch - -export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" # if not set, default to 0 - -# take all the input arguments and put them in environment variables -# this could override the hyper-parameters defined above -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - -echo "Using Learning Rate ${LR} and LoRA LR ${LORA_LR} for ${MODEL_TYPE} model" - -export WANDB_PROJECT="panza-${PANZA_USERNAME}" - -if [ "$PANZA_FINETUNE_WITH_PREAMBLE" = 1 ]; then - PREAMBLE_STR="-PREAMBLE" - PREPROCESSING_FN=panza.finetuning.preprocessing:panza_preprocessing_function_train_with_preamble -else - PREAMBLE_STR="" - PREPROCESSING_FN=panza.finetuning.preprocessing:panza_preprocessing_function -fi - -if [ "$PANZA_FINETUNE_WITH_RAG" = 1 ]; then - RAFT_STR=-RAFT_num${PANZA_FINETUNE_RAG_NUM_EMAILS}_prob${PANZA_FINETUNE_RAG_PROB}_th${PANZA_FINETUNE_RAG_RELEVANCE_THRESHOLD} -else - RAFT_STR="" -fi - -# some post-processing on the inputs -export MAX_DURATION=${NUM_EPOCHS}ep -export RUN_NAME=panza_${PANZA_USERNAME}_${MODEL_TYPE}_${MODEL_PRECISION}-bs${BS}-rosa_${SCHEDULE}_d${SPA_DENSITY}_${SPA_NUM_GRADS}grads_${SPA_GRAD_ACC_MODE}_r${LORA_R}_loralr${LORA_LR}_alpha${LORA_ALPHA}-lr${LR}-epochs${NUM_EPOCHS}-wu${WARMUP}-seed${SEED}${PREAMBLE_STR}${RAFT_STR}-$RANDOM - -# create directories to save the masks and models -mkdir -p ${BASE_SAVE_PATH}/masks/ -mkdir -p ${BASE_SAVE_PATH}/models/ - -if [ "$MODEL_PRECISION" = "bf16" ]; then - export ROSA_DTYPE=bf16 -elif [ "$MODEL_PRECISION" = "4bit" ]; then - export ROSA_DTYPE=fp32 -elif [ "$MODEL_PRECISION" = "fp32" ]; then - export ROSA_DTYPE=fp32 -else - echo "Unknown model precision $MODEL_PRECISION" - exit 1 -fi - -if [[ "$SPA_DENSITY" != "0" ]] -then - # sparse adaptation exists, so we need to generate masks - - if [[ $LORA_R == 0 ]] - then - export SCHEDULE=spa_only - fi - - # no wandb logging for mask generation - export WANDB_DISABLED=True - - # generate the masks and terminate - TRAIN_SCRIPT=${PANZA_WORKSPACE}/src/panza/finetuning/train.py - composer ${TRAIN_SCRIPT} \ - ${CONFIG} \ - model_name_or_path=${PRETRAINED} \ - num_cpu_threads=${NUM_CPU_THREADS} \ - train_loader.dataset.hf_kwargs.data_files=${DATA_PATH} \ - train_loader.dataset.preprocessing_fn=${PREPROCESSING_FN} \ - max_duration=${MAX_DURATION} \ - run_name=${RUN_NAME} \ - optimizer.lr=${LR} \ - global_train_batch_size=${BS} \ - device_train_microbatch_size=${PER_DEVICE_BS} \ - device_eval_batch_size=${PER_DEVICE_BS} \ - scheduler.t_warmup=${WARMUP}ba \ - model.weight_bias_dtype=${MASK_GEN_MODEL_PRECISION} \ - rosa.spa_d=${SPA_DENSITY} \ - rosa.spa_num_grads=${SPA_NUM_GRADS} \ - rosa.grad_acc_mode=${SPA_GRAD_ACC_MODE} \ - rosa.lora_r=${LORA_R} \ - rosa.lora_alpha=${LORA_ALPHA} \ - rosa.lora_lr=${LORA_LR} \ - rosa.schedule=${SCHEDULE} \ - rosa.rosa_dtype=${ROSA_DTYPE} \ - global_seed=${SEED} \ - seed=${SEED} \ - hf_save_path=${BASE_SAVE_PATH}/models/ \ - rosa.mask_save_path=${BASE_SAVE_PATH}/masks/${RUN_NAME} \ - rosa.terminate_after_mask_generation=true -fi - -# now we have the masks ready, so let's restart -export 
MASK_LOAD_PATH=${BASE_SAVE_PATH}/masks/${RUN_NAME} - -# determine the correct RoSA schedule -if [[ "$SPA_DENSITY" != "0" && $LORA_R -ne 0 ]] -then - export SCHEDULE=default -elif [[ $LORA_R -ne 0 ]] -then - export SCHEDULE=lora_only - export MASK_LOAD_PATH= -else - export SCHEDULE=spa_only -fi - -TEMP_FILE=$(mktemp) - -export WANDB_DISABLED=${PANZA_WANDB_DISABLED} -# start the training with both sparse and low-rank adapters active from the outset -TRAIN_SCRIPT=${PANZA_WORKSPACE}/src/panza/finetuning/train.py -composer ${TRAIN_SCRIPT} \ - ${CONFIG} \ - model_name_or_path=${PRETRAINED} \ - num_cpu_threads=${NUM_CPU_THREADS} \ - train_loader.dataset.hf_kwargs.data_files=${DATA_PATH} \ - train_loader.dataset.preprocessing_fn=${PREPROCESSING_FN} \ - max_duration=${MAX_DURATION} \ - run_name=${RUN_NAME} \ - optimizer.lr=${LR} \ - global_train_batch_size=${BS} \ - device_train_microbatch_size=${PER_DEVICE_BS} \ - device_eval_batch_size=${PER_DEVICE_BS} \ - scheduler.t_warmup=${WARMUP}ba \ - model.weight_bias_dtype=${MODEL_PRECISION} \ - rosa.spa_d=${SPA_DENSITY} \ - rosa.spa_num_grads=${SPA_NUM_GRADS} \ - rosa.grad_acc_mode=${SPA_GRAD_ACC_MODE} \ - rosa.lora_r=${LORA_R} \ - rosa.lora_alpha=${LORA_ALPHA} \ - rosa.lora_lr=${LORA_LR} \ - rosa.schedule=${SCHEDULE} \ - rosa.rosa_dtype=${ROSA_DTYPE} \ - global_seed=${SEED} \ - seed=${SEED} \ - hf_save_path=${BASE_SAVE_PATH}/models/ \ - rosa.mask_load_path=${MASK_LOAD_PATH} 2>&1 | tee "$TEMP_FILE" - -# Extract the wandb run ID from the temp file -WANDB_RUN_ID=$(grep -o 'https://wandb.ai/[^ ]*/runs/[^ ]*' "$TEMP_FILE" | awk -F'/' '{print $NF}' | tail -n 1) - -# Clean up -rm "$TEMP_FILE" -rm -rf "$MASK_LOAD_PATH" - -echo "find the adapter at ${BASE_SAVE_PATH}/models/${RUN_NAME}" - -USE_4BIT_QUANT=$([ "${MODEL_PRECISION}" = "4bit" ] && echo "--load-in-4bit" || echo "") - -if [ -z "$WANDB_RUN_ID" ]; then - echo "No wandb run ID found." 
-else - echo "Extracted wandb run ID: $WANDB_RUN_ID" -fi - -# Running BLEU evaluation -EVAL_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/evaluation.py -python ${EVAL_SCRIPT} \ - --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --thread-preamble=${PANZA_THREAD_PREAMBLE_PATH} \ - --golden=${PANZA_DATA_DIR}/test.jsonl \ - --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ - --wandb-run-id=${WANDB_RUN_ID} \ - ${USE_4BIT_QUANT} - -# Running BLEU evaluation with thread -EVAL_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/evaluation.py -python ${EVAL_SCRIPT} \ - --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --thread-preamble=${PANZA_THREAD_PREAMBLE_PATH} \ - --golden=${PANZA_DATA_DIR}/test.jsonl \ - --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ - --wandb-run-id=${WANDB_RUN_ID} \ - --use-thread \ - ${USE_4BIT_QUANT} - -# Running BLEU evaluation with RAG -python ${EVAL_SCRIPT} \ - --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --thread-preamble=${PANZA_THREAD_PREAMBLE_PATH} \ - --golden=${PANZA_DATA_DIR}/test.jsonl \ - --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ - --wandb-run-id=${WANDB_RUN_ID} \ - --embedding-model=${PANZA_EMBEDDING_MODEL} \ - --db-path=${PANZA_DATA_DIR} \ - --index-name=${PANZA_USERNAME} \ - --use-rag \ - ${USE_4BIT_QUANT} +# First create the masks for RoSA finetuning. +composer ../src/panza3/finetuning/train.py \ + user=${current_user} finetuning=rosa finetuning.rosa.masks_only=true -echo "find the adapter at ${BASE_SAVE_PATH}/models/${RUN_NAME}" +# Then train the weights. 
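+# The overrides below are identical to the first run; only finetuning.rosa.masks_only changes.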
+composer ../src/panza3/finetuning/train.py \ + user=${current_user} finetuning=rosa finetuning.rosa.masks_only=false \ No newline at end of file From 684990339bec4aaca355dc8fb2b694589109e6d3 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Thu, 12 Sep 2024 13:29:40 +0200 Subject: [PATCH 066/112] add streaming to cli --- src/panza3/interface/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/panza3/interface/cli.py b/src/panza3/interface/cli.py index 6ad11b5..94166a2 100644 --- a/src/panza3/interface/cli.py +++ b/src/panza3/interface/cli.py @@ -11,5 +11,6 @@ def __init__(self, writer: PanzaWriter, **kwargs): break else: instruction: Instruction = EmailInstruction(user_input) - response = self.writer.run(instruction) - print(response) + stream = self.writer.run(instruction, stream=True) + for chunk in stream: + print(chunk, end="") From 764f9806cb3d0f095e4705414696bfa8f9a9f03c Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Thu, 12 Sep 2024 13:30:00 +0200 Subject: [PATCH 067/112] add streaming to transformers --- src/panza3/llm/local.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/panza3/llm/local.py b/src/panza3/llm/local.py index bc345e8..9aa781b 100644 --- a/src/panza3/llm/local.py +++ b/src/panza3/llm/local.py @@ -12,7 +12,7 @@ _MISSING_LIBRARIES.append("peft") try: - from transformers import AutoModelForCausalLM, AutoTokenizer + from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer except ImportError: AutoModelForCausalLM = None AutoTokenizer = None @@ -93,8 +93,25 @@ def chat_stream(self, messages: ChatHistoryType) -> Iterator[str]: if isinstance(messages[0], (list, tuple)) or hasattr(messages[0], "messages"): raise TypeError("chat_stream does not support batched messages.") - # TODO: Implement chat_stream. - raise NotImplementedError("chat_stream is not implemented for LocalLLM.") + streamer = TextStreamer(self.tokenizer) + encodeds = self.tokenizer.apply_chat_template( + messages, + return_tensors="pt", + add_generation_prompt=True, + padding=True, + truncation=True, + return_dict=True, + ) + model_inputs = encodeds.to(self.device) + + self.model.generate( + model_inputs, + streamer=streamer, + **self.sampling_parameters, + pad_token_id=self.tokenizer.pad_token_id, + ) + + return streamer def _check_installation(self) -> None: if AutoModelForCausalLM is None or AutoTokenizer is None: From 50dcde279f1d50899383eadd0d5b760b34af7579 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 13 Sep 2024 11:48:04 +0200 Subject: [PATCH 068/112] update training scripts to process arguments correctly --- scripts/train_fft.sh | 34 ++++++++++++++++++++++++++++++++-- scripts/train_rosa.sh | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/scripts/train_fft.sh b/scripts/train_fft.sh index d79c5b0..69b65d0 100755 --- a/scripts/train_fft.sh +++ b/scripts/train_fft.sh @@ -1,6 +1,36 @@ +# Convenience script for running full finetuning. +# All arguments to the python script can be provided +# here exactly in the form they would be passed to the +# python script directly. +# +# Example usage: +# ./train_fft.sh user=alonso trainer.optimizer.lr=0.1 + set -e -current_user=$(whoami) +vars=() +# Set a default for the required user argument. We'll override it +# later if provided. 
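+# Remaining key=value pairs are collected below and forwarded to Hydra unchanged.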
+vars[1]=$"user=$(whoami)" +idx=2 + +# process input arguments +for argument in "$@" +do + key=$(echo $argument | cut -f1 -d=) + + if [[ $key == user ]]; then + # We already set the default value here; change it now. + vars[1]=$argument + elif [[ $key == finetuning ]]; then + echo "The 'finetuning' argument is already set and should not be overridden here; override is ignored." + elif [[ $key == finetuning.rosa.masks_only ]]; then + echo "The 'finetuning.rosa.masks_only' argument is already set and should not be overridden here; override is ignored." + else + vars[idx]=$argument + idx+=1 + fi +done composer ../src/panza3/finetuning/train.py \ - user=${current_user} finetuning=full \ No newline at end of file + finetuning=full ${vars[@]} \ No newline at end of file diff --git a/scripts/train_rosa.sh b/scripts/train_rosa.sh index 5f81bd8..07d1e03 100755 --- a/scripts/train_rosa.sh +++ b/scripts/train_rosa.sh @@ -1,11 +1,41 @@ +# Convenience script for running RoSA finetuning. +# All arguments to the python script can be provided +# here exactly in the form they would be passed to the +# python script directly. +# +# Example usage: +# ./train_rosa.sh user=alonso trainer.optimizer.lr=0.1 + set -e -current_user=$(whoami) +vars=() +# Set a default for the required user argument. We'll override it +# later if provided. +vars[1]=$"user=$(whoami)" +idx=2 + +# process input arguments +for argument in "$@" +do + key=$(echo $argument | cut -f1 -d=) + + if [[ $key == user ]]; then + # We already set the default value here; change it now. + vars[1]=$argument + elif [[ $key == finetuning ]]; then + echo "The 'finetuning' argument is already set and should not be overridden here; override is ignored." + elif [[ $key == finetuning.rosa.masks_only ]]; then + echo "The 'finetuning.rosa.masks_only' argument is already set and should not be overridden here; override is ignored." + else + vars[idx]=$argument + idx+=1 + fi +done # First create the masks for RoSA finetuning. composer ../src/panza3/finetuning/train.py \ - user=${current_user} finetuning=rosa finetuning.rosa.masks_only=true + finetuning=rosa finetuning.rosa.masks_only=true ${vars[@]} # Then train the weights. composer ../src/panza3/finetuning/train.py \ - user=${current_user} finetuning=rosa finetuning.rosa.masks_only=false \ No newline at end of file + finetuning=rosa finetuning.rosa.masks_only=false ${vars[@]} From 693701893abe1eb4efd116961437f91cd558ce42 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 13 Sep 2024 11:51:32 +0200 Subject: [PATCH 069/112] minor fix for train_fft --- scripts/train_fft.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/train_fft.sh b/scripts/train_fft.sh index 69b65d0..7e06e85 100755 --- a/scripts/train_fft.sh +++ b/scripts/train_fft.sh @@ -24,8 +24,6 @@ do vars[1]=$argument elif [[ $key == finetuning ]]; then echo "The 'finetuning' argument is already set and should not be overridden here; override is ignored." - elif [[ $key == finetuning.rosa.masks_only ]]; then - echo "The 'finetuning.rosa.masks_only' argument is already set and should not be overridden here; override is ignored." 
else vars[idx]=$argument idx+=1 From 0dc314ac3d9e0651bfb66676e98ddbce5a8e5903 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 13 Sep 2024 13:38:28 +0200 Subject: [PATCH 070/112] write the full checkpoint to the expected location --- src/panza3/finetuning/train.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index ee1133c..2b1e0a7 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -3,8 +3,10 @@ import copy import gc +import glob import logging import os +import shutil import tempfile import time import warnings @@ -939,6 +941,16 @@ def main(cfg: DictConfig) -> Trainer: log.info('Starting training...') trainer.fit() + # Hacky solution for moving the model checkpoint from the + # subdirectory that the HF writer wrote it into, and into + # our desired and expected location. + if torch.distributed.get_rank() == 0: + path_to_save = os.path.join(hf_save_path, run_name) + hf_output_path = os.path.join(path_to_save, "huggingface") + for filename in glob.glob(os.path.join(hf_output_path, "*", "*")): + shutil.copy(filename, path_to_save) + shutil.rmtree(os.path.join(hf_output_path)) + # if rosa is enabled, save the model manually, since # llm-foundry's checkpointing doesn't work properly with RoSA if rosa_config is not None: From 9fa93f0bf70efb139c8edf78ec860fc8a85d0dd4 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Wed, 11 Sep 2024 15:58:26 +0200 Subject: [PATCH 071/112] add sampling parameters to ollama LLM --- src/panza3/llm/ollama.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/panza3/llm/ollama.py b/src/panza3/llm/ollama.py index b7f803c..e6f7223 100644 --- a/src/panza3/llm/ollama.py +++ b/src/panza3/llm/ollama.py @@ -18,7 +18,7 @@ def __init__(self, name: str, gguf_file: str, sampling_parameters: Dict): """ super().__init__(name, sampling_parameters) self.gguf_file = gguf_file - self.sampling_params = sampling_parameters + self.sampling_parameters = sampling_parameters if not self._is_ollama_running(): self._start_ollama() @@ -48,13 +48,24 @@ def _is_model_loaded(self) -> bool: return True return False + def _make_modelfile_parameters(self) -> str: + if self.sampling_parameters is None or self.sampling_parameters["do_sample"] == False: + return "" + return f""" + PARAMETER temperature {self.sampling_parameters["temperature"]} + PARAMETER top_k {self.sampling_parameters["top_k"]} + PARAMETER top_p {self.sampling_parameters["top_p"]} + PARAMETER num_predict {self.sampling_parameters["max_new_tokens"]} + """ + def _load_model(self) -> None: - # TODO: Add sampling parameters to the model file modelfile = f""" FROM {self.gguf_file} + {self._make_modelfile_parameters()} """ try: ollama.create(model={self.name}, modelfile=modelfile, stream=True) + print("Loaded a new mode into Ollama.") except: raise Exception(f"Failed to load model {self.name} with GGUF file {self.gguf_file}.") From ddd6041d910aee154dda675b168f3411139b8e09 Mon Sep 17 00:00:00 2001 From: Armand Nicolicioiu Date: Wed, 11 Sep 2024 13:49:12 +0200 Subject: [PATCH 072/112] Refactor data summarization --- configs/panza_preparation.yaml | 10 ++ .../prompting/summarization_prompting.yaml | 3 + configs/writer/summary.yaml | 5 + .../summarization_prompt.txt | 0 scripts/prepare_data.py | 119 ++++++++++++++++++ src/panza3/entities/__init__.py | 4 +- src/panza3/entities/instruction.py | 5 + src/panza3/prompting/__init__.py | 3 +- 
.../prompting/summarization_prompting.py | 19 +++ src/panza3/retriever/faiss.py | 1 + 10 files changed, 166 insertions(+), 3 deletions(-) create mode 100644 configs/panza_preparation.yaml create mode 100644 configs/writer/prompting/summarization_prompting.yaml create mode 100644 configs/writer/summary.yaml rename {src/panza/data_preparation => prompt_preambles}/summarization_prompt.txt (100%) create mode 100644 scripts/prepare_data.py create mode 100644 src/panza3/prompting/summarization_prompting.py diff --git a/configs/panza_preparation.yaml b/configs/panza_preparation.yaml new file mode 100644 index 0000000..54b001b --- /dev/null +++ b/configs/panza_preparation.yaml @@ -0,0 +1,10 @@ +defaults: + - base + - user: armand + - writer: summary + - writer/prompting/retriever/faiss@retriever + +batch_size: 8 +data_path: ${user.data_dir}/${user.username}_clean.jsonl +checkpoint: "microsoft/Phi-3-mini-4k-instruct" +force: false \ No newline at end of file diff --git a/configs/writer/prompting/summarization_prompting.yaml b/configs/writer/prompting/summarization_prompting.yaml new file mode 100644 index 0000000..6e44871 --- /dev/null +++ b/configs/writer/prompting/summarization_prompting.yaml @@ -0,0 +1,3 @@ +_target_: panza3.prompting.SummarizationPromptBuilder + +summarization_prompt: ${load_preamble:${panza_workspace}/prompt_preambles/summarization_prompt.txt} diff --git a/configs/writer/summary.yaml b/configs/writer/summary.yaml new file mode 100644 index 0000000..76d8f83 --- /dev/null +++ b/configs/writer/summary.yaml @@ -0,0 +1,5 @@ +defaults: + - llm: transformers + - prompting: summarization_prompting + +_target_: panza3.writer.PanzaWriter diff --git a/src/panza/data_preparation/summarization_prompt.txt b/prompt_preambles/summarization_prompt.txt similarity index 100% rename from src/panza/data_preparation/summarization_prompt.txt rename to prompt_preambles/summarization_prompt.txt diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py new file mode 100644 index 0000000..8674660 --- /dev/null +++ b/scripts/prepare_data.py @@ -0,0 +1,119 @@ +import json +import logging +import os +import sys +import time +from typing import List + +import hydra +from omegaconf import DictConfig, OmegaConf +from tqdm import tqdm + +from panza3 import PanzaWriter # The import also loads custom Hydra resolvers +from panza3.entities import Document, Email, SummarizationInstruction +from panza3.retriever import DocumentRetriever + +LOGGER = logging.getLogger(__name__) + + +def rename_config_keys(cfg: DictConfig) -> None: + # Disable struct mode to allow modifications + OmegaConf.set_struct(cfg, False) + + cfg.writer.llm.sampling_parameters = cfg.writer.llm.sampling + del cfg.writer.llm.sampling + + cfg.writer.prompt_builder = cfg.writer.prompting + del cfg.writer.prompting + + # Re-enable struct mode to lock down the configuration + OmegaConf.set_struct(cfg, True) + + +def load_documents(data_path: str) -> None: + assert data_path.endswith(".jsonl"), f"Expecting a .jsonl file, but given = {data_path}" + + LOGGER.info(f"--> Reading emails from: {data_path}") + + with open(data_path, "r") as f: + lines = f.readlines() + documents = [Email.deserialize(line.strip(",")) for line in lines] + print(f"--> # emails = {len(documents)}") + + return documents + + +def generate_synthetic_instructions( + documents: List[Document], writer: PanzaWriter, batch_size: int, output_path: str +) -> None: + num_processed_documents = 0 + num_batches = (len(documents) - 1) // batch_size + 1 + start_time = time.time() + with 
open(output_path, "w") as f: + for i in tqdm(range(0, len(documents), batch_size)): + print(f"--> Processing batch {i // batch_size + 1}/{num_batches}") + batch = documents[i : i + batch_size] + # TODO: Rename .email to .content + instructions = [ + SummarizationInstruction(instruction=document.email) for document in batch + ] + + summaries = writer.run_batch(instructions) + num_processed_documents += len(summaries) + + for it, summary in enumerate(summaries): + # TODO: Add cleaning and filtering + batch[it].summary = summary + + # Write the summarized documents to a file + for document in batch: + f.write(json.dumps(document.serialize())) + f.write("\n") + + elapsed_time = time.time() - start_time + LOGGER.info(f"--> Processed {num_processed_documents} documents in {elapsed_time:.2f} seconds.") + + +def check_if_file_exists(cfg: DictConfig) -> None: + output_path = cfg.data_path.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" + if os.path.exists(output_path) and not cfg.force: + LOGGER.warning( + "Summaries already exists, program will close. " + "If you want to regenerate use the flag force=true." + ) + sys.exit(0) + + +@hydra.main(version_base="1.1", config_path="../configs", config_name="panza_preparation") +def main(cfg: DictConfig) -> None: + LOGGER.info("Running Panza Data Preparation") + LOGGER.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) + + # Skip running if summaries already exist + check_if_file_exists(cfg) + + # Rename config keys to follow class structure + rename_config_keys(cfg) + + # Instantiate Panza writer + writer: PanzaWriter = hydra.utils.instantiate(cfg.writer) + assert isinstance(writer, PanzaWriter), "Failed to instantiate PanzaWriter" + + # Instantiate retriever + retriever: DocumentRetriever = hydra.utils.instantiate(cfg.retriever) + assert isinstance(retriever, DocumentRetriever), "Failed to instantiate DocumentRetriever" + retriever.set_document_class(Email) + + # Load documentas + documents = load_documents(cfg.data_path) + # TODO: Add custom resolver for output path and add it in config + output_path = cfg.data_path.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" + generate_synthetic_instructions( + documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=output_path + ) + + # TODO: Create vector store + + +if __name__ == "__main__": + main() diff --git a/src/panza3/entities/__init__.py b/src/panza3/entities/__init__.py index f306513..99266aa 100644 --- a/src/panza3/entities/__init__.py +++ b/src/panza3/entities/__init__.py @@ -1,4 +1,4 @@ from .document import Document, Email -from .instruction import EmailInstruction, Instruction +from .instruction import EmailInstruction, Instruction, SummarizationInstruction -__all__ = ["Document", "Email", "EmailInstruction", "Instruction"] +__all__ = ["Document", "Email", "EmailInstruction", "Instruction", "SummarizationInstruction"] diff --git a/src/panza3/entities/instruction.py b/src/panza3/entities/instruction.py index 544f5f5..f622329 100644 --- a/src/panza3/entities/instruction.py +++ b/src/panza3/entities/instruction.py @@ -14,3 +14,8 @@ class Instruction(ABC): @dataclass(kw_only=True) class EmailInstruction(Instruction): thread: List[str] = field(default_factory=list) + + +@dataclass(kw_only=True) +class SummarizationInstruction(Instruction): + pass diff --git a/src/panza3/prompting/__init__.py b/src/panza3/prompting/__init__.py index 4257c7c..ab79ffd 100644 --- a/src/panza3/prompting/__init__.py +++ b/src/panza3/prompting/__init__.py @@ -1,4 +1,5 @@ from .base import 
PromptBuilder from .email_prompting import EmailPromptBuilder +from .summarization_prompting import SummarizationPromptBuilder -__all__ = ["PromptBuilder", "EmailPromptBuilder"] +__all__ = ["PromptBuilder", "EmailPromptBuilder", "SummarizationPromptBuilder"] diff --git a/src/panza3/prompting/summarization_prompting.py b/src/panza3/prompting/summarization_prompting.py new file mode 100644 index 0000000..e1e28fb --- /dev/null +++ b/src/panza3/prompting/summarization_prompting.py @@ -0,0 +1,19 @@ +from ..entities import SummarizationInstruction +from .base import PromptBuilder + + +class SummarizationPromptBuilder(PromptBuilder): + def __init__( + self, + summarization_prompt: str, + ): + self.summarization_prompt = summarization_prompt + + def build_prompt( + self, + instruction: SummarizationInstruction, + ) -> str: + + prompt = self.summarization_prompt.format(email=instruction.instruction).strip() + + return prompt diff --git a/src/panza3/retriever/faiss.py b/src/panza3/retriever/faiss.py index fcdfa9a..898851c 100644 --- a/src/panza3/retriever/faiss.py +++ b/src/panza3/retriever/faiss.py @@ -88,6 +88,7 @@ def store(self, documents: List[Document], chunk_size: int, chunk_overlap: int): if self.db: self.db.merge_from(db) else: + LOGGER.info(f"Creating new Faiss index {self.index_name} in {self.db_path}.") self.db = db def save_db_to_disk(self): From 91a64f0bc6e744e45f18544520240e0320d69225 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Thu, 12 Sep 2024 13:29:40 +0200 Subject: [PATCH 073/112] add streaming to cli --- src/panza3/interface/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/panza3/interface/cli.py b/src/panza3/interface/cli.py index 6ad11b5..94166a2 100644 --- a/src/panza3/interface/cli.py +++ b/src/panza3/interface/cli.py @@ -11,5 +11,6 @@ def __init__(self, writer: PanzaWriter, **kwargs): break else: instruction: Instruction = EmailInstruction(user_input) - response = self.writer.run(instruction) - print(response) + stream = self.writer.run(instruction, stream=True) + for chunk in stream: + print(chunk, end="") From 41d27c3f9c56501bab7ecb6cf757276698e411de Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Thu, 12 Sep 2024 13:30:00 +0200 Subject: [PATCH 074/112] add streaming to transformers --- src/panza3/llm/local.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/panza3/llm/local.py b/src/panza3/llm/local.py index bc345e8..9aa781b 100644 --- a/src/panza3/llm/local.py +++ b/src/panza3/llm/local.py @@ -12,7 +12,7 @@ _MISSING_LIBRARIES.append("peft") try: - from transformers import AutoModelForCausalLM, AutoTokenizer + from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer except ImportError: AutoModelForCausalLM = None AutoTokenizer = None @@ -93,8 +93,25 @@ def chat_stream(self, messages: ChatHistoryType) -> Iterator[str]: if isinstance(messages[0], (list, tuple)) or hasattr(messages[0], "messages"): raise TypeError("chat_stream does not support batched messages.") - # TODO: Implement chat_stream. 
- raise NotImplementedError("chat_stream is not implemented for LocalLLM.") + streamer = TextStreamer(self.tokenizer) + encodeds = self.tokenizer.apply_chat_template( + messages, + return_tensors="pt", + add_generation_prompt=True, + padding=True, + truncation=True, + return_dict=True, + ) + model_inputs = encodeds.to(self.device) + + self.model.generate( + model_inputs, + streamer=streamer, + **self.sampling_parameters, + pad_token_id=self.tokenizer.pad_token_id, + ) + + return streamer def _check_installation(self) -> None: if AutoModelForCausalLM is None or AutoTokenizer is None: From 66cf2d4c5042baf0b12802b8a01d0f59871657a4 Mon Sep 17 00:00:00 2001 From: Sean Yang <53060248+shawseanyang@users.noreply.github.com> Date: Fri, 13 Sep 2024 15:12:21 +0200 Subject: [PATCH 075/112] update web.py to match LLM interface --- src/panza3/interface/web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/panza3/interface/web.py b/src/panza3/interface/web.py index 26065ba..59a5786 100644 --- a/src/panza3/interface/web.py +++ b/src/panza3/interface/web.py @@ -42,7 +42,7 @@ def _get_valid_api_keys(self) -> List[str]: def _streamer(self, stream): for chunk in stream: - yield chunk["message"]["content"] + yield chunk def _predict(self, input: str) -> Generator: instruction: Instruction = EmailInstruction(input) From 0c456b53a0c19f038f296d98a6aec48cc8f2c69a Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Mon, 16 Sep 2024 15:20:32 +0200 Subject: [PATCH 076/112] first pass at porting evaluation to new framework --- configs/interfaces/json.yaml | 9 ++ src/panza3/interface/__init__.py | 3 +- src/panza3/interface/json.py | 136 +++++++++++++++++++++++++++++++ 3 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 configs/interfaces/json.yaml create mode 100644 src/panza3/interface/json.py diff --git a/configs/interfaces/json.yaml b/configs/interfaces/json.yaml new file mode 100644 index 0000000..ed9996b --- /dev/null +++ b/configs/interfaces/json.yaml @@ -0,0 +1,9 @@ +input_file: /nfs/scistore19/alistgrp/eiofinov/PanzaMail/data/test_small.jsonl +batch_size: 8 +use_thread: false +responses_per_prompt: 1 +checkpoint: ${checkpoint} +panza_workspace: ${panza_workspace} +compute_metrics: true +user: ${user.username} +_target_: panza3.interface.PanzaJSON \ No newline at end of file diff --git a/src/panza3/interface/__init__.py b/src/panza3/interface/__init__.py index 9448a5f..6d25a4b 100644 --- a/src/panza3/interface/__init__.py +++ b/src/panza3/interface/__init__.py @@ -1,5 +1,6 @@ from .web import PanzaWebService from .cli import PanzaCLI from .gui import PanzaGUI +from .json import PanzaJSON -__all__ = ["PanzaWebService", "PanzaCLI", "PanzaGUI"] +__all__ = ["PanzaWebService", "PanzaCLI", "PanzaGUI", "PanzaJSON"] diff --git a/src/panza3/interface/json.py b/src/panza3/interface/json.py new file mode 100644 index 0000000..87cefaf --- /dev/null +++ b/src/panza3/interface/json.py @@ -0,0 +1,136 @@ +from panza3.entities.instruction import EmailInstruction, Instruction +from panza3.writer import PanzaWriter + +import json +import numpy as np +import os +import re + + +from evaluate import load +from torchmetrics.text.bleu import BLEUScore +from torchmetrics.text.rouge import ROUGEScore +import string +punc_table = str.maketrans({key: None for key in string.punctuation}) +rouge = ROUGEScore() +bleu1 = BLEUScore(n_gram=1) +bleu2 = BLEUScore(n_gram=2) +bleu3 = BLEUScore(n_gram=3) +bleu4 = BLEUScore(n_gram=4) +mauve = load('mauve') + +def compute_rouge_scores(predictions, goldens): + 
goldens= [" ".join(x.translate(punc_table).lower().split()) for x in goldens] + candidates = [" ".join(prediction.translate(punc_table).lower().split()) for prediction in predictions] + scores = [{k: v.item() for k, v in rouge(candidate, goldens).items()} for candidate in candidates] + return scores + +def compute_bleu_scores(predictions, goldens): + goldens= [" ".join(x.translate(punc_table).lower().split()) for x in goldens] + candidates = [" ".join(prediction.translate(punc_table).lower().split()) for prediction in predictions] + bleu_scores = [np.mean([bleu([candidate], [goldens]) for bleu in [bleu1, bleu2, bleu3, bleu4]]) for candidate in candidates] + return [s.item() for s in bleu_scores] + +def compute_mauve_score(predictions, goldens): + predictions = [prediction for nested_prediction in predictions for prediction in nested_prediction] + goldens = [golden for nested_golden in goldens for golden in nested_golden] + mauve_score = mauve.compute(predictions=predictions, references=goldens) + return mauve_score + + +class PanzaJSON: + + def compose_output_folder(self, checkpoint, panza_workspace): + if os.path.isdir(checkpoint): + output_dir = checkpoint + else: # Assume that this is a huggingface model + output_dir = os.path.join(panza_workspace, "checkpoints", "models", checkpoint) + os.makedirs(output_dir, exist_ok=True) + return os.path.join(output_dir, "panza_outputs.json") + + + def assemble_responses(self, prompts_json, batch_size, use_thread, + responses_per_prompt, compute_metrics): + + with open (prompts_json, "r") as f: + golden_lines = [json.loads(l) for l in f.readlines()] + + grouped_golden = {} + for entry in golden_lines: + if entry["summary"] in grouped_golden: + grouped_golden[entry["summary"]]["templates"].append(entry["email"]) + else: + grouped_golden[entry["summary"]] = {} + grouped_golden[entry["summary"]]["templates"] = [(entry["email"])] + grouped_golden[entry["summary"]]["thread"] = entry["thread"] + grouped_golden = list(grouped_golden.items()) + + all_responses = [] + for i in range(0, len(grouped_golden), batch_size): + batch = grouped_golden[i:i + batch_size] + prompts = [item[0] for item in batch] + if use_thread: + threads = [item[1]["thread"] for item in batch] + golden_responses = [item[1]["templates"] for item in batch] + + responses = [{"prompt": p, + "full_prompt": "", + "thread": None if not use_thread else threads[i], + "golden_responses": golden_responses[i], + "panza_responses": []} for i, p in enumerate(prompts)] + for _ in range(responses_per_prompt): + if use_thread: + instructions = list(zip(prompts, threads)) + else: + instructions = list(zip(prompts, [None]*len(prompts))) + + outputs, full_prompts = self.writer.run_batch([EmailInstruction(user_input) for user_input in instructions], return_prompt=True) + # Remove some boilerplate added by instruction-tuned models w/out finetuning. 
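+                # Strip "SUBJECT:"/"Subject:"/"E-MAIL CONTENT:" header lines that some models prepend.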
+ outputs = [o.replace("Here is the email:\n", "") for o in outputs] + outputs = [re.sub(r'SUBJECT:.*\n', "", o) for o in outputs] + outputs = [re.sub(r'Subject:.*\n', "", o) for o in outputs] + outputs = [re.sub(r'E-MAIL CONTENT:.*\n', "", o) for o in outputs] + + for i, r in enumerate(responses): + r["full_prompt"] = full_prompts[i] + r["panza_responses"].append(outputs[i]) + all_responses += responses + + if compute_metrics: + for response in all_responses: + response["scores"] = {} + response["scores"]["ROUGE"] = compute_rouge_scores( + response["panza_responses"], + response["golden_responses"]) + response["scores"]["BLEU"] = compute_bleu_scores( + response["panza_responses"], + response["golden_responses"]) + rouge_categories = all_responses[0]["scores"]["ROUGE"][0].keys() + aggregate_metrics = { + "BLEU": np.mean([s for r in all_responses for s in r["scores"]["BLEU"]]), + "ROUGE": {cat: + np.mean([ + s[cat] for r in all_responses for s in r["scores"]["ROUGE"]]) + for cat in rouge_categories}, + "MAUVE": compute_mauve_score([r["panza_responses"] for r in all_responses], + [r["golden_responses"] for r in all_responses]).mauve + } + return {"responses": all_responses, "aggregate_metrics": aggregate_metrics} + + + def __init__(self, writer: PanzaWriter, + checkpoint: str, + panza_workspace: str, + input_file: str, + batch_size: int, + use_thread: bool, + responses_per_prompt: int, + compute_metrics: bool, + user: str): + self.writer = writer + responses = self.assemble_responses(input_file, batch_size, + use_thread, responses_per_prompt, + compute_metrics) + output_path = self.compose_output_folder(checkpoint, panza_workspace) + with open(output_path, 'w') as f: + json.dump(responses, f, indent=4, sort_keys=True) From 2e20f12c2e22b67dd3f0388160ce637b2a5d23d1 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Mon, 16 Sep 2024 15:56:14 +0200 Subject: [PATCH 077/112] do NOT split test and train data by default --- scripts/prepare_dataset.sh | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/scripts/prepare_dataset.sh b/scripts/prepare_dataset.sh index a10e324..3e0799f 100755 --- a/scripts/prepare_dataset.sh +++ b/scripts/prepare_dataset.sh @@ -2,7 +2,7 @@ source config.sh -TRAIN_RATIO=0.8 +TRAIN_RATIO=1.0 SPLIT_TYPE="chronological" # random or chronological CHUNK_SIZE=3000 @@ -21,6 +21,8 @@ do export "$KEY"="$VALUE" done + + USE_4BIT_QUANT=$([ "${LOAD_IN_4BIT}" = 1 ] && echo "--load-in-4bit" || echo "") USE_FP32_COMPUTE=$([ "${RUN_FP32}" = 1 ] && echo "--fp32" || echo "") @@ -30,15 +32,31 @@ python ../src/panza/data_preparation/summarize_emails.py \ --prompt-file="${PANZA_WORKSPACE}/src/panza/data_preparation/summarization_prompt.txt" \ --batch-size=${PANZA_SUMMARIZATION_BATCH_SIZE} ${USE_4BIT_QUANT} ${USE_FP32_COMPUTE} && -# Create train and test splits -python ../src/panza/data_preparation/split_data.py \ - --data-path="${PANZA_DATA_DIR}/${PANZA_USERNAME}_clean_summarized.jsonl" \ - --output-data-dir=${PANZA_DATA_DIR} \ - --train-ratio=${TRAIN_RATIO} \ - --split-type=${SPLIT_TYPE} \ - --seed=${PANZA_SEED} && +if [[ $TRAIN_RATIO < 1.0 ]]; then + # Create train and test splits + SPLIT_PANZA_DATA_DIR=${PANZA_DATA_DIR}/split + + python ../src/panza/data_preparation/split_data.py \ + --data-path="${PANZA_DATA_DIR}/${PANZA_USERNAME}_clean_summarized.jsonl" \ + --output-data-dir=${PANZA_DATA_DIR}/split \ + --train-ratio=${TRAIN_RATIO} \ + --split-type=${SPLIT_TYPE} \ + --seed=${PANZA_SEED} + + PANZA_DATA_DIR=$SPLIT_PANZA_DATA_DIR +else + cp 
"${PANZA_DATA_DIR}/${PANZA_USERNAME}_clean_summarized.jsonl" \ + "${PANZA_DATA_DIR}/train.jsonl" + + # Finetuning requires some sort of test set, just use the training + # data again. + cp "${PANZA_DATA_DIR}/${PANZA_USERNAME}_clean_summarized.jsonl" \ + "${PANZA_DATA_DIR}/test.jsonl" +fi # Create vector store with emails embeddings +# Note that if the data is split, then the PANZA_DATA_DIR, +# where the vector store will be, will be the /split directory. python ../src/panza/data_preparation/create_vector_store.py \ --path-to-emails="${PANZA_DATA_DIR}/train.jsonl" \ --chunk-size=${CHUNK_SIZE} \ From 298d023df0753c03e045177e87e92870ab0c4f5f Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Tue, 17 Sep 2024 11:41:33 +0200 Subject: [PATCH 078/112] make the json interface more robust to json file format --- configs/interfaces/json.yaml | 4 +- src/panza3/interface/json.py | 102 ++++++++++++++++++++++------------- 2 files changed, 67 insertions(+), 39 deletions(-) diff --git a/configs/interfaces/json.yaml b/configs/interfaces/json.yaml index ed9996b..4a51fbc 100644 --- a/configs/interfaces/json.yaml +++ b/configs/interfaces/json.yaml @@ -1,9 +1,9 @@ -input_file: /nfs/scistore19/alistgrp/eiofinov/PanzaMail/data/test_small.jsonl +input_file: ${panza_workspace}/data/test.jsonl batch_size: 8 use_thread: false responses_per_prompt: 1 checkpoint: ${checkpoint} panza_workspace: ${panza_workspace} compute_metrics: true -user: ${user.username} +username: ${user.username} _target_: panza3.interface.PanzaJSON \ No newline at end of file diff --git a/src/panza3/interface/json.py b/src/panza3/interface/json.py index 87cefaf..ae22f49 100644 --- a/src/panza3/interface/json.py +++ b/src/panza3/interface/json.py @@ -40,29 +40,46 @@ def compute_mauve_score(predictions, goldens): class PanzaJSON: - def compose_output_folder(self, checkpoint, panza_workspace): + def compose_output_folder(self, json_path, checkpoint, panza_workspace, username): if os.path.isdir(checkpoint): + # Presumably this is a Panza-trained model; go ahead + # and put the json output into the same folder. output_dir = checkpoint - else: # Assume that this is a huggingface model - output_dir = os.path.join(panza_workspace, "checkpoints", "models", checkpoint) + else: + # Assume that this is a huggingface model identified by its hf handle. + # We don't want to populate the cached model folder, so instead + # we create a folder in the Panza workspace to put the output. + output_dir = os.path.join(panza_workspace, "checkpoints", "models", checkpoint, username) os.makedirs(output_dir, exist_ok=True) - return os.path.join(output_dir, "panza_outputs.json") + filename_no_ext = os.path.splitext(os.path.basename(json_path))[0] + return os.path.join(output_dir, f"{filename_no_ext}_outputs.json") def assemble_responses(self, prompts_json, batch_size, use_thread, - responses_per_prompt, compute_metrics): + responses_per_prompt): with open (prompts_json, "r") as f: golden_lines = [json.loads(l) for l in f.readlines()] + # Group json lines together by prompt to avoid weirdness in + # eval metric computation. In case golden responses are provided, + # all goldens are used as alternatives for BLEU and ROUGE scores; + # the first one provided is used for MAUVE. grouped_golden = {} + has_goldens = False for entry in golden_lines: + # 'summary' is the name of the 'prompt' field, i.e., the one to group on. 
if entry["summary"] in grouped_golden: - grouped_golden[entry["summary"]]["templates"].append(entry["email"]) + if 'email' in entry: + has_goldens = True + grouped_golden[entry["summary"]]["goldens"].append(entry["email"]) else: grouped_golden[entry["summary"]] = {} - grouped_golden[entry["summary"]]["templates"] = [(entry["email"])] + if 'email' in entry: + has_goldens = True + grouped_golden[entry["summary"]]["goldens"] = [(entry["email"])] grouped_golden[entry["summary"]]["thread"] = entry["thread"] + # Convert dict to list of (k, v) pairs to batch through it. grouped_golden = list(grouped_golden.items()) all_responses = [] @@ -71,13 +88,13 @@ def assemble_responses(self, prompts_json, batch_size, use_thread, prompts = [item[0] for item in batch] if use_thread: threads = [item[1]["thread"] for item in batch] - golden_responses = [item[1]["templates"] for item in batch] + golden_responses = [item[1]["goldens"] for item in batch] responses = [{"prompt": p, - "full_prompt": "", - "thread": None if not use_thread else threads[i], - "golden_responses": golden_responses[i], - "panza_responses": []} for i, p in enumerate(prompts)] + "full_prompt": None, + "thread": None if not use_thread else threads[i], + "golden_responses": golden_responses[i], + "panza_responses": []} for i, p in enumerate(prompts)] for _ in range(responses_per_prompt): if use_thread: instructions = list(zip(prompts, threads)) @@ -85,6 +102,7 @@ def assemble_responses(self, prompts_json, batch_size, use_thread, instructions = list(zip(prompts, [None]*len(prompts))) outputs, full_prompts = self.writer.run_batch([EmailInstruction(user_input) for user_input in instructions], return_prompt=True) + # Remove some boilerplate added by instruction-tuned models w/out finetuning. outputs = [o.replace("Here is the email:\n", "") for o in outputs] outputs = [re.sub(r'SUBJECT:.*\n', "", o) for o in outputs] @@ -95,26 +113,29 @@ def assemble_responses(self, prompts_json, batch_size, use_thread, r["full_prompt"] = full_prompts[i] r["panza_responses"].append(outputs[i]) all_responses += responses - - if compute_metrics: - for response in all_responses: - response["scores"] = {} - response["scores"]["ROUGE"] = compute_rouge_scores( - response["panza_responses"], - response["golden_responses"]) - response["scores"]["BLEU"] = compute_bleu_scores( - response["panza_responses"], - response["golden_responses"]) - rouge_categories = all_responses[0]["scores"]["ROUGE"][0].keys() - aggregate_metrics = { - "BLEU": np.mean([s for r in all_responses for s in r["scores"]["BLEU"]]), - "ROUGE": {cat: - np.mean([ - s[cat] for r in all_responses for s in r["scores"]["ROUGE"]]) - for cat in rouge_categories}, - "MAUVE": compute_mauve_score([r["panza_responses"] for r in all_responses], - [r["golden_responses"] for r in all_responses]).mauve - } + return all_responses, has_goldens + + def do_compute_metrics(self, all_responses): + for response in all_responses: + response["scores"] = {} + response["scores"]["BLEU"] = compute_bleu_scores( + response["panza_responses"], + response["golden_responses"]) + response["scores"]["ROUGE"] = compute_rouge_scores( + response["panza_responses"], + response["golden_responses"]) + rouge_categories = all_responses[0]["scores"]["ROUGE"][0].keys() + aggregate_metrics = { + "BLEU": np.mean([s for r in all_responses for s in r["scores"]["BLEU"]]), + "ROUGE": {cat: + np.mean([ + s[cat] for r in all_responses for s in r["scores"]["ROUGE"]]) + for cat in rouge_categories}, + "MAUVE": compute_mauve_score([r["panza_responses"] for r 
in all_responses], + [r["golden_responses"] for r in all_responses]).mauve + } + print("########## Aggregated quality metrics ##########\n") + print(json.dumps(aggregate_metrics, indent=2)) return {"responses": all_responses, "aggregate_metrics": aggregate_metrics} @@ -126,11 +147,18 @@ def __init__(self, writer: PanzaWriter, use_thread: bool, responses_per_prompt: int, compute_metrics: bool, - user: str): + username: str): self.writer = writer - responses = self.assemble_responses(input_file, batch_size, - use_thread, responses_per_prompt, - compute_metrics) - output_path = self.compose_output_folder(checkpoint, panza_workspace) + responses, has_goldens = self.assemble_responses(input_file, batch_size, + use_thread, responses_per_prompt) + if compute_metrics: + if has_goldens: + responses = self.do_compute_metrics(responses) + else: + print("Warning: metrics requested but no golden labels given!", + "\nDumping responses without computing metrics.") + + output_path = self.compose_output_folder( + input_file, checkpoint, panza_workspace, username) with open(output_path, 'w') as f: json.dump(responses, f, indent=4, sort_keys=True) From 02e0322798679cec4ed544b61609dccd2e1a6e3f Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Thu, 19 Sep 2024 15:01:39 +0200 Subject: [PATCH 079/112] fix bug where RoSA FFT still tries to move folder over --- src/panza3/finetuning/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index 2b1e0a7..ecdd038 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -943,8 +943,9 @@ def main(cfg: DictConfig) -> Trainer: # Hacky solution for moving the model checkpoint from the # subdirectory that the HF writer wrote it into, and into - # our desired and expected location. - if torch.distributed.get_rank() == 0: + # our desired and expected location. Only needed for full + # (not low-rank) finetuning. 
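+    # RoSA runs save their adapters manually further below, so they skip this move.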
+ if rosa_config is None and torch.distributed.get_rank() == 0: path_to_save = os.path.join(hf_save_path, run_name) hf_output_path = os.path.join(path_to_save, "huggingface") for filename in glob.glob(os.path.join(hf_output_path, "*", "*")): From d4b70f863ccbba787f43e918a280a8315997a789 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Thu, 19 Sep 2024 16:58:03 +0200 Subject: [PATCH 080/112] emergency bugfix for streaming to cli --- src/panza3/interface/cli.py | 3 +-- src/panza3/llm/local.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/panza3/interface/cli.py b/src/panza3/interface/cli.py index 94166a2..bcf03a3 100644 --- a/src/panza3/interface/cli.py +++ b/src/panza3/interface/cli.py @@ -12,5 +12,4 @@ def __init__(self, writer: PanzaWriter, **kwargs): else: instruction: Instruction = EmailInstruction(user_input) stream = self.writer.run(instruction, stream=True) - for chunk in stream: - print(chunk, end="") + stream.end() diff --git a/src/panza3/llm/local.py b/src/panza3/llm/local.py index 9aa781b..20bc656 100644 --- a/src/panza3/llm/local.py +++ b/src/panza3/llm/local.py @@ -105,7 +105,7 @@ def chat_stream(self, messages: ChatHistoryType) -> Iterator[str]: model_inputs = encodeds.to(self.device) self.model.generate( - model_inputs, + **model_inputs, streamer=streamer, **self.sampling_parameters, pad_token_id=self.tokenizer.pad_token_id, From 3314abee50669f28c4921a2c0f812c8e3ab829eb Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 18 Oct 2024 12:05:12 +0200 Subject: [PATCH 081/112] add creating RAG DB to data preparation script --- configs/panza_preparation.yaml | 6 +- scripts/prepare_data.py | 5 +- src/panza3/data_preparation/rag.py | 119 +++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+), 4 deletions(-) create mode 100644 src/panza3/data_preparation/rag.py diff --git a/configs/panza_preparation.yaml b/configs/panza_preparation.yaml index 54b001b..93563db 100644 --- a/configs/panza_preparation.yaml +++ b/configs/panza_preparation.yaml @@ -1,10 +1,12 @@ defaults: - base - - user: armand - writer: summary - writer/prompting/retriever/faiss@retriever batch_size: 8 data_path: ${user.data_dir}/${user.username}_clean.jsonl checkpoint: "microsoft/Phi-3-mini-4k-instruct" -force: false \ No newline at end of file +force: false +rag_embedding_chunk_size: 3000 +rag_embedding_chunk_overlap: 3000 +rag_embedding_model: "sentence-transformers/all-mpnet-base-v2" \ No newline at end of file diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py index 8674660..2c1cbd7 100644 --- a/scripts/prepare_data.py +++ b/scripts/prepare_data.py @@ -12,6 +12,7 @@ from panza3 import PanzaWriter # The import also loads custom Hydra resolvers from panza3.entities import Document, Email, SummarizationInstruction from panza3.retriever import DocumentRetriever +from panza3.data_preparation.rag import create_vector_store LOGGER = logging.getLogger(__name__) @@ -104,7 +105,7 @@ def main(cfg: DictConfig) -> None: assert isinstance(retriever, DocumentRetriever), "Failed to instantiate DocumentRetriever" retriever.set_document_class(Email) - # Load documentas + # Load documents documents = load_documents(cfg.data_path) # TODO: Add custom resolver for output path and add it in config output_path = cfg.data_path.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" @@ -112,7 +113,7 @@ def main(cfg: DictConfig) -> None: documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=output_path ) - # TODO: Create vector store + create_vector_store(output_path, 
cfg.rag_embedding_chunk_size, cfg.rag_embedding_chunk_overlap, os.path.dirname(cfg.data_path), cfg.user.username, cfg.rag_embedding_model) if __name__ == "__main__": diff --git a/src/panza3/data_preparation/rag.py b/src/panza3/data_preparation/rag.py new file mode 100644 index 0000000..649c281 --- /dev/null +++ b/src/panza3/data_preparation/rag.py @@ -0,0 +1,119 @@ +import copy +import json +import time +from abc import ABC, abstractmethod +from dataclasses import asdict, dataclass +from datetime import datetime +from typing import Dict, List, Optional, Union + +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import FAISS +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore +from langchain.text_splitter import RecursiveCharacterTextSplitter + +@dataclass(kw_only=True) +class Email(ABC): + email: str + subject: str + thread: List[str] + summary: Optional[str] = None + date: datetime + + def serialize(self) -> dict: + dictionary = asdict(self) + dictionary["date"] = self.date.isoformat() + return dictionary + + @classmethod + def deserialize(cls, data: Union[str, Dict]) -> "Email": + if isinstance(data, str): + dictionary = json.loads(data) + elif isinstance(data, dict): + dictionary = copy.deepcopy(data) + else: + raise ValueError(f"Cannot deserialize data of type {type(data)}. Must be str or dict.") + dictionary["date"] = datetime.fromisoformat(dictionary["date"]) + return cls(**dictionary) + + +def get_embeddings_model(model_name) -> Embeddings: + embeddings_model = HuggingFaceEmbeddings( + model_name=model_name, + model_kwargs={"device": "cpu"}, + encode_kwargs={"normalize_embeddings": False}, + ) + return embeddings_model + + +def create_vector_db(docs: List[Document], embeddings_model: Embeddings) -> VectorStore: + db = FAISS.from_documents(docs, embeddings_model) + return db + + +def load_vector_db_from_disk( + folder_path: str, index_name: str, embeddings_model: Embeddings +) -> VectorStore: + try: + db = FAISS.load_local( + folder_path=folder_path, + embeddings=embeddings_model, + index_name=index_name, + allow_dangerous_deserialization=True, # Allows pickle deserialization + ) + print("Faiss index loaded ") + return db + except Exception as e: + print("Fiass index loading failed \n", e) + + +def load_emails(path: str) -> List[Email]: + with open(path, "r") as f: + lines = f.readlines() + + emails = [Email.deserialize(line) for line in lines] + + return emails + + +def process_emails(emails: List[Email], chunk_size: int, chunk_overlap: int) -> List[Document]: + # Convert e-mails to langchain documents + documents = [ + Document(page_content=email.email, metadata={"serialized_email": email.serialize()}) + for email in emails + ] + + # Split long e-mails into text chunks + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) + documents = text_splitter.split_documents(documents) + + return documents + + + +def create_vector_store(path_to_emails, chunk_size, chunk_overlap, db_path, index_name, embedding_model): + + # Load emails + emails = load_emails(path_to_emails) + print(f"Loaded {len(emails)} emails.") + + # Process emails + documents = process_emails(emails, chunk_size, chunk_overlap) + print(f"Obtained {len(documents)} text chunks.") + + # Initialize embeddings model + embeddings_model = get_embeddings_model(embedding_model) + + # Create vector DB + print("Creating vector 
DB...") + start = time.time() + db = create_vector_db(documents, embeddings_model) + print(f"Vector DB created in {time.time() - start} seconds.") + + # Save vector DB to disk + db.save_local(folder_path=db_path, index_name=index_name) + print(f"Vector DB index {index_name} saved to {db_path}.") + From ab96feae641928cb724ed52d0899bd4cd6ee2997 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 18 Oct 2024 14:13:45 +0200 Subject: [PATCH 082/112] add test-train splitting to data preparation --- configs/panza_preparation.yaml | 6 +++++ scripts/prepare_data.py | 41 +++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/configs/panza_preparation.yaml b/configs/panza_preparation.yaml index 93563db..89e87e7 100644 --- a/configs/panza_preparation.yaml +++ b/configs/panza_preparation.yaml @@ -7,6 +7,12 @@ batch_size: 8 data_path: ${user.data_dir}/${user.username}_clean.jsonl checkpoint: "microsoft/Phi-3-mini-4k-instruct" force: false + + # Parameters for train-test split, if required. +test_split: 0. +split_type: random # For test-train split. Options are 'random', 'chronological'. + +# Parameters for RAG database. rag_embedding_chunk_size: 3000 rag_embedding_chunk_overlap: 3000 rag_embedding_model: "sentence-transformers/all-mpnet-base-v2" \ No newline at end of file diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py index 2c1cbd7..2c18dec 100644 --- a/scripts/prepare_data.py +++ b/scripts/prepare_data.py @@ -1,6 +1,9 @@ +import datetime import json import logging import os +import random +import shutil import sys import time from typing import List @@ -85,6 +88,35 @@ def check_if_file_exists(cfg: DictConfig) -> None: sys.exit(0) +def split_and_write_data(cfg): + data_dir = os.path.dirname(cfg.data_path) + if cfg.test_split == 0: + shutil.copy(cfg.data_path, os.path.join(data_dir, "train.jsonl")) + # Bad hack - we need test data for the training to work. 
+ shutil.copy(cfg.data_path, os.path.join(data_dir, "test.jsonl")) + else: + with open(cfg.data_path, "r") as f: + data = f.readlines() + if cfg.split_type == "random": + random.seed(cfg.seed) + random.shuffle(data) + elif cfg.split_type == "chronological": + data = sorted(data, key=lambda x: datetime.fromisoformat(json.loads(x)["date"])) + else: + raise ValueError("Invalid split type.") + + train_size = int(len(data) * 1-cfg.test_split) + + with open(os.path.join(data_dir, "train.jsonl"), "w") as f: + for i in range(train_size): + f.write(data[i]) + + with open(os.path.join(data_dir, "test.jsonl"), "w") as f: + for i in range(train_size, len(data)): + f.write(data[i]) + + + @hydra.main(version_base="1.1", config_path="../configs", config_name="panza_preparation") def main(cfg: DictConfig) -> None: LOGGER.info("Running Panza Data Preparation") @@ -109,9 +141,12 @@ def main(cfg: DictConfig) -> None: documents = load_documents(cfg.data_path) # TODO: Add custom resolver for output path and add it in config output_path = cfg.data_path.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" - generate_synthetic_instructions( - documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=output_path - ) + # generate_synthetic_instructions( + # documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=output_path + # ) + + # Write the test data to test.jsonl, with an optional train-test split + split_and_write_data(cfg) create_vector_store(output_path, cfg.rag_embedding_chunk_size, cfg.rag_embedding_chunk_overlap, os.path.dirname(cfg.data_path), cfg.user.username, cfg.rag_embedding_model) From e3f12e06b65ce3df1f51e0b94ed19147beab4a93 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 18 Oct 2024 14:16:21 +0200 Subject: [PATCH 083/112] undo accidental commenting out --- scripts/prepare_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py index 2c18dec..a32f372 100644 --- a/scripts/prepare_data.py +++ b/scripts/prepare_data.py @@ -141,9 +141,9 @@ def main(cfg: DictConfig) -> None: documents = load_documents(cfg.data_path) # TODO: Add custom resolver for output path and add it in config output_path = cfg.data_path.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" - # generate_synthetic_instructions( - # documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=output_path - # ) + generate_synthetic_instructions( + documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=output_path + ) # Write the test data to test.jsonl, with an optional train-test split split_and_write_data(cfg) From dbcc5d037b35c5fa93e4a3bd3c1d84dffe1f54b4 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 18 Oct 2024 16:50:34 +0200 Subject: [PATCH 084/112] bug fix --- scripts/prepare_data.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py index a32f372..b99a8d5 100644 --- a/scripts/prepare_data.py +++ b/scripts/prepare_data.py @@ -88,14 +88,14 @@ def check_if_file_exists(cfg: DictConfig) -> None: sys.exit(0) -def split_and_write_data(cfg): +def split_and_write_data(summarized_data_path,cfg): data_dir = os.path.dirname(cfg.data_path) if cfg.test_split == 0: - shutil.copy(cfg.data_path, os.path.join(data_dir, "train.jsonl")) + shutil.copy(summarized_data_path, os.path.join(data_dir, "train.jsonl")) # Bad hack - we need test data for the training to work. 
- shutil.copy(cfg.data_path, os.path.join(data_dir, "test.jsonl")) + shutil.copy(summarized_data_path, os.path.join(data_dir, "test.jsonl")) else: - with open(cfg.data_path, "r") as f: + with open(summarized_data_path, "r") as f: data = f.readlines() if cfg.split_type == "random": random.seed(cfg.seed) @@ -146,7 +146,7 @@ def main(cfg: DictConfig) -> None: ) # Write the test data to test.jsonl, with an optional train-test split - split_and_write_data(cfg) + split_and_write_data(output_path, cfg) create_vector_store(output_path, cfg.rag_embedding_chunk_size, cfg.rag_embedding_chunk_overlap, os.path.dirname(cfg.data_path), cfg.user.username, cfg.rag_embedding_model) From dfbde00349ffe0c74c2b24acb0cec7b0f2a4d200 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 18 Oct 2024 16:51:34 +0200 Subject: [PATCH 085/112] add tqdm to json interface --- src/panza3/interface/json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/panza3/interface/json.py b/src/panza3/interface/json.py index ae22f49..2a64e77 100644 --- a/src/panza3/interface/json.py +++ b/src/panza3/interface/json.py @@ -5,6 +5,7 @@ import numpy as np import os import re +from tqdm import tqdm from evaluate import load @@ -18,6 +19,7 @@ bleu3 = BLEUScore(n_gram=3) bleu4 = BLEUScore(n_gram=4) mauve = load('mauve') +from tqdm import tqdm def compute_rouge_scores(predictions, goldens): goldens= [" ".join(x.translate(punc_table).lower().split()) for x in goldens] @@ -83,7 +85,7 @@ def assemble_responses(self, prompts_json, batch_size, use_thread, grouped_golden = list(grouped_golden.items()) all_responses = [] - for i in range(0, len(grouped_golden), batch_size): + for i in tqdm(range(0, len(grouped_golden), batch_size)): batch = grouped_golden[i:i + batch_size] prompts = [item[0] for item in batch] if use_thread: From eb720d19e6446e9cbfb0ed14522b3ac3b9d8c252 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Fri, 18 Oct 2024 17:36:06 +0200 Subject: [PATCH 086/112] let panza writer load latest checkpoint --- configs/panza_writer.yaml | 7 +++++-- scripts/runner.py | 12 ++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/configs/panza_writer.yaml b/configs/panza_writer.yaml index 62434f5..2faf9f2 100644 --- a/configs/panza_writer.yaml +++ b/configs/panza_writer.yaml @@ -2,8 +2,11 @@ defaults: - base - writer: email - interfaces: - - gui + # - gui # - cli # - web + - json -checkpoint: '/nfs/scistore19/alistgrp/eiofinov/PanzaMail/data/jen_model' \ No newline at end of file +# Either a full path to the checkpoint, or the 'latest' tag, +# Which looks for the latest checkpoint in `checkpoint_dir` +checkpoint: 'latest' \ No newline at end of file diff --git a/scripts/runner.py b/scripts/runner.py index 9c860b5..0d5137f 100644 --- a/scripts/runner.py +++ b/scripts/runner.py @@ -1,6 +1,8 @@ import logging +import glob import hydra +import os from omegaconf import DictConfig, OmegaConf from panza3 import PanzaWriter # The import also loads custom Hydra resolvers @@ -21,6 +23,12 @@ def rename_config_keys(cfg: DictConfig) -> None: # Re-enable struct mode to lock down the configuration OmegaConf.set_struct(cfg, True) + +def get_latest_model(cfg: DictConfig) -> None: + model_files = glob.glob(f'{cfg.checkpoint_dir}/models/*') # * means all if need specific format then *.csv + latest_file = max(model_files, key=os.path.getctime) + return latest_file + @hydra.main(version_base="1.1", config_path="../configs", config_name="panza_writer") @@ -29,7 +37,11 @@ def main(cfg: DictConfig) -> None: 
LOGGER.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) # Rename config keys to follow class structure + + # Find the latest checkpoint, if requested. rename_config_keys(cfg) + if cfg.checkpoint == "latest": + cfg.checkpoint = get_latest_model(cfg) # Instantiate Panza writer writer: PanzaWriter = hydra.utils.instantiate(cfg.writer) From d7b298fd6745aae441c01f77d2d24aafecc0930d Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Mon, 21 Oct 2024 12:25:03 +0200 Subject: [PATCH 087/112] add email extraction to data preparation --- configs/panza_preparation.yaml | 13 +- scripts/prepare_data.py | 33 ++-- src/panza3/data_preparation/extract_emails.py | 185 ++++++++++++++++++ 3 files changed, 213 insertions(+), 18 deletions(-) create mode 100644 src/panza3/data_preparation/extract_emails.py diff --git a/configs/panza_preparation.yaml b/configs/panza_preparation.yaml index 89e87e7..449df89 100644 --- a/configs/panza_preparation.yaml +++ b/configs/panza_preparation.yaml @@ -4,13 +4,20 @@ defaults: - writer/prompting/retriever/faiss@retriever batch_size: 8 -data_path: ${user.data_dir}/${user.username}_clean.jsonl + +email_dump_path: ${user.data_dir}/Sent.mbox +cleaned_emails_path: ${user.data_dir}/${user.username}_emails_clean.jsonl +discarded_emails_dir: ${user.data_dir}/${user.username}/discarded_emails +summarized_emails_path: ${user.data_dir}/${user.username}_emails_clean_summarized.jsonl + +rag_db_dir: ${user.data_dir} + checkpoint: "microsoft/Phi-3-mini-4k-instruct" -force: false +force: false # If false, data will not be recreated if it already exists. # Parameters for train-test split, if required. test_split: 0. -split_type: random # For test-train split. Options are 'random', 'chronological'. +split_type: random # Options are 'random', 'chronological'. # Parameters for RAG database. rag_embedding_chunk_size: 3000 diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py index b99a8d5..d650cff 100644 --- a/scripts/prepare_data.py +++ b/scripts/prepare_data.py @@ -15,6 +15,7 @@ from panza3 import PanzaWriter # The import also loads custom Hydra resolvers from panza3.entities import Document, Email, SummarizationInstruction from panza3.retriever import DocumentRetriever +from panza3.data_preparation.extract_emails import extract_emails from panza3.data_preparation.rag import create_vector_store LOGGER = logging.getLogger(__name__) @@ -79,8 +80,7 @@ def generate_synthetic_instructions( def check_if_file_exists(cfg: DictConfig) -> None: - output_path = cfg.data_path.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" - if os.path.exists(output_path) and not cfg.force: + if os.path.exists(cfg.cleaned_emails_path) and not cfg.force: LOGGER.warning( "Summaries already exists, program will close. " "If you want to regenerate use the flag force=true." @@ -88,14 +88,13 @@ def check_if_file_exists(cfg: DictConfig) -> None: sys.exit(0) -def split_and_write_data(summarized_data_path,cfg): - data_dir = os.path.dirname(cfg.data_path) +def split_and_write_data(cfg): if cfg.test_split == 0: - shutil.copy(summarized_data_path, os.path.join(data_dir, "train.jsonl")) + shutil.copy(cfg.summarized_emails_path, os.path.join(cfg.user.data_dir, "train.jsonl")) # Bad hack - we need test data for the training to work. 
- shutil.copy(summarized_data_path, os.path.join(data_dir, "test.jsonl")) + shutil.copy(cfg.summarized_emails_path, os.path.join(cfg.user.data_dir, "test.jsonl")) else: - with open(summarized_data_path, "r") as f: + with open(cfg.summarized_emails_path, "r") as f: data = f.readlines() if cfg.split_type == "random": random.seed(cfg.seed) @@ -107,11 +106,11 @@ def split_and_write_data(summarized_data_path,cfg): train_size = int(len(data) * 1-cfg.test_split) - with open(os.path.join(data_dir, "train.jsonl"), "w") as f: + with open(os.path.join(cfg.user.data_dir, "train.jsonl"), "w") as f: for i in range(train_size): f.write(data[i]) - with open(os.path.join(data_dir, "test.jsonl"), "w") as f: + with open(os.path.join(cfg.user.data_dir, "test.jsonl"), "w") as f: for i in range(train_size, len(data)): f.write(data[i]) @@ -122,12 +121,17 @@ def main(cfg: DictConfig) -> None: LOGGER.info("Running Panza Data Preparation") LOGGER.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) + # Skip running if summaries already exist check_if_file_exists(cfg) # Rename config keys to follow class structure rename_config_keys(cfg) + # Extract the emails from the .mbox file + extract_emails(cfg.email_dump_path, cfg.cleaned_emails_path, [cfg.user.email_address], cfg.discarded_emails_dir) + # Filter emails? + # Instantiate Panza writer writer: PanzaWriter = hydra.utils.instantiate(cfg.writer) assert isinstance(writer, PanzaWriter), "Failed to instantiate PanzaWriter" @@ -138,17 +142,16 @@ def main(cfg: DictConfig) -> None: retriever.set_document_class(Email) # Load documents - documents = load_documents(cfg.data_path) - # TODO: Add custom resolver for output path and add it in config - output_path = cfg.data_path.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" + documents = load_documents(cfg.cleaned_emails_path) generate_synthetic_instructions( - documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=output_path + documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=cfg.summarized_emails_path ) # Write the test data to test.jsonl, with an optional train-test split - split_and_write_data(output_path, cfg) + split_and_write_data(cfg) - create_vector_store(output_path, cfg.rag_embedding_chunk_size, cfg.rag_embedding_chunk_overlap, os.path.dirname(cfg.data_path), cfg.user.username, cfg.rag_embedding_model) + # Use only the training data (which might be all the data) for RAG. 
+ create_vector_store(os.path.join(cfg.user.data_dir, "train.jsonl"), cfg.rag_embedding_chunk_size, cfg.rag_embedding_chunk_overlap, cfg.rag_db_dir, cfg.user.username, cfg.rag_embedding_model) if __name__ == "__main__": diff --git a/src/panza3/data_preparation/extract_emails.py b/src/panza3/data_preparation/extract_emails.py new file mode 100644 index 0000000..ef34543 --- /dev/null +++ b/src/panza3/data_preparation/extract_emails.py @@ -0,0 +1,185 @@ +import argparse +import json +import mailbox +import re +from email.utils import parsedate_to_datetime +from os import makedirs +from os.path import join, dirname + +import langdetect + +CLEAN_EMAILS = [] +DISCARDED_EMAILS = { + "non_english": [], + "forwarded": [], + "short": [], + "empty": [], + "cant_decode_utf8": [], +} + +SHORT_EMAIL_THRESHOLD = 10 # words + + +def extract_only_plain_text(msg_part): + if msg_part.get_content_type() == "text/plain": + body = msg_part.get_payload(decode=True) + plain_text = body.decode() # assuming the text is in UTF-8, handle other cases later + return plain_text + + +def skip_forwarded_messages(plain_text): + if "---------- Forwarded message ---------" in plain_text: + DISCARDED_EMAILS["forwarded"].append(plain_text) + return "" + else: + return plain_text + + +def remove_date_time(email_body): + # Regular expression pattern to match lines starting with "On " and ending with "> wrote: " + # The pattern uses non-greedy matching (.*?) to find the shortest match that satisfies the condition + pattern = re.compile(r"(^On.*wrote.*)|(^Am.*schrieb.*)", re.MULTILINE | re.DOTALL) + + match = pattern.search(email_body) + if match: + return (email_body[:match.start()] + email_body[match.end():]).strip() + else: + return email_body + + +def remove_lines_starting_with_gt(text): + lines = text.split("\n") + filtered_lines = [ + line for line in lines if not line.startswith(">") + ] # Filter out lines starting with "> " + return "\n".join(filtered_lines) + + +def count_words(s): + return len(s.split()) + + +def extract_by_quote_level(text): + # Split the text into lines + lines = text.split('\n') + + # Dictionary to store lines by quote level + grouped_lines = {} + + for line in lines: + # Count the number of '>' at the start of the line + quote_level = len(re.match(r'^>*', line).group()) + + # Remove leading '>' and spaces + clean_line = re.sub(r'^>*\s*', '', line) + + # Add the clean line to the appropriate group + if quote_level not in grouped_lines: + grouped_lines[quote_level] = [] + grouped_lines[quote_level].append(clean_line) + + return grouped_lines + + +def filter_message(msg): + try: + plain_text = extract_only_plain_text(msg) + except: + DISCARDED_EMAILS["cant_decode_utf8"].append(msg) + return None + + if plain_text is None: + return None + + plain_text = skip_forwarded_messages(plain_text) + email_with_thread = extract_by_quote_level(plain_text) + email_with_thread = ["\n".join(an_email).strip() for an_email in email_with_thread.values()] + + # remove "On ... 
wrote:" lines + email_with_thread = [remove_date_time(an_email) for an_email in email_with_thread] + + main_email = email_with_thread.pop(0) + email_with_thread.reverse() # chronological order + + # check length before detecting language + if count_words(main_email) < SHORT_EMAIL_THRESHOLD: + DISCARDED_EMAILS["short"].append(plain_text) + return None + try: + if langdetect.detect(main_email) != "en": + DISCARDED_EMAILS["non_english"].append(plain_text) + return None + except: + # failed to detect language + DISCARDED_EMAILS["non_english"].append(plain_text) + return None + + if main_email.isspace() or main_email == "": + DISCARDED_EMAILS["empty"].append(plain_text) + return None + + return (main_email.strip(), [an_email.strip() for an_email in email_with_thread]) + + +def extract_emails(mailbox_path, output_path, email_addresses, save_discarded_emails_path): + + MBOX_PATH = mailbox_path + EMAIL = email_addresses + + mbox = mailbox.mbox(MBOX_PATH) + n_emails = len(mbox) + for i, message in enumerate(mbox): + print(f"--> processing {i}/{n_emails} <--") + # Filter messages sent from your email address + if message["from"] and any(email in message["from"] for email in EMAIL): + date = parsedate_to_datetime(message["Date"]).isoformat() + if message.is_multipart(): + for part in message.walk(): + filtered_msg = filter_message(part) + if filtered_msg is not None: + print(filtered_msg) + main_email, thread = filtered_msg + CLEAN_EMAILS.append({"email": main_email, "thread": thread, "subject": message["Subject"], "date": date}) + else: + filtered_msg = filter_message(message) + if filtered_msg is not None: + print(filtered_msg) + main_email, thread = filtered_msg + CLEAN_EMAILS.append({"email": main_email, "thread": thread, "subject": message["Subject"], "date": date}) + + print(f"\n---> [Cleaning stats] <---") + print(f"# clean emails = {len(CLEAN_EMAILS)}") + print( + f"# discarded emails:" + f"\n\t non_english = {len(DISCARDED_EMAILS['non_english'])}" + f"\n\t empty = {len(DISCARDED_EMAILS['empty'])}" + f"\n\t short (less than {SHORT_EMAIL_THRESHOLD} words)= {len(DISCARDED_EMAILS['short'])}" + f"\n\t forwarded = {len(DISCARDED_EMAILS['forwarded'])}" + f"\n\t cant_decode_utf8 = {len(DISCARDED_EMAILS['cant_decode_utf8'])}" + ) + + first_email = EMAIL[0] + username = first_email[: first_email.find("@")] + + makedirs(dirname(output_path), exist_ok=True) + + # Save clean emails + with open(join(output_path), "w", encoding="utf-8") as f: + for item in CLEAN_EMAILS: + json_record = json.dumps(item) + f.write(json_record + "\n") + + # Save discarded emails + if save_discarded_emails_path and save_discarded_emails_path != "": + makedirs(save_discarded_emails_path, exist_ok=True) + for k, v in DISCARDED_EMAILS.items(): + output_path = join( + save_discarded_emails_path, f"{username}_discarded_{k}.jsonl" + ) + with open(output_path, "w", encoding="utf-8") as f: + for i, item in enumerate(v): + print("\n\n\n\n\===========================") + print(item) + print("this is number", i, output_path) + json_record = json.dumps(item) + f.write(json_record + "\n") From 924033e7b38faf8958326cd51653a323f87c1d88 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Mon, 21 Oct 2024 12:48:37 +0200 Subject: [PATCH 088/112] update panza readme --- README_panza3.md | 62 ++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/README_panza3.md b/README_panza3.md index 0762c96..5eab314 100644 --- a/README_panza3.md +++ b/README_panza3.md @@ -64,16 +64,16 @@ The overall 
structure of Panza is as follows: 1. Make sure you have a version of [conda](https://docs.anaconda.com/free/miniconda/miniconda-install/) installed. 2. Create a new conda environment named 'panza' (or something else) and activate it: ``` bash -conda create -n panza python=3.10 +conda create -n panza python=3.10 -y conda activate panza ``` 3. Install the required packages: ``` bash -pip install panza_mail +pip install . ``` 4. If you want to also finetune models using Panza, you will need to install the additional packages: ``` bash -pip install panza_mail[training] +pip install .[training] ``` ## TODO: :rocket: Getting started @@ -109,16 +109,26 @@ At the end of this step you should have the downloaded emails placed inside `dat ### Step 1: Environment configuration -Panza is configured through a set of environment variables defined in `scripts/config.sh` and shared along all running scripts. +Panza is configured through a set of yaml configurations defined in `configs/`. There is a single high-level config under `configs/base.yaml`, and the rest are organized unders the main functionalities of the code. +Note that these task-specific configs can, in some cases, be used to overrride base configs. + Specific use cases, such as hyperparameter tuning, are covered in more detail in `scripts/README.md`. (TODO jen: write this up.) - -The LLM prompt is controlled by a set of `prompt_preambles` that give the model more insight about its role, the user and how to reuse existing emails for *Retrieval-Augmented Generation (RAG)*. See more details in the [prompting section](prompt_preambles/README.md). +1. Data preparation: `configs/data_preparation.yaml`. Additionally, a custom user config must be added under `config/users/` (see below). +1. Finetuning: the main config is in `configs/panza_finetuning.yaml` and the method-specific ones are in `configs/finetuning/` +1. Serving: Serving consists of two parts - a serving infrastructre (that we call 'writer') that runs the LLM and so converts prompts to Panza outputs, and an `interface`, which presents the outputs in a useful form - through a command-line interface, a web interface, a gmail client (TODO:Sean), or in a bulk `.json` format (useful for evaluation). The configs for serving are in `panza_writer.yaml`, and for the interfaces, under `configs/interfaces`. + +These scripts are described in more detail in `scripts/README.md`, but a few customizations need to happen immediately. :warning: Before continuing, make sure you complete the following setup: - - Modifiy the environment variable `PANZA_EMAIL_ADDRESS` inside `scripts/config.sh` with your own email address. - - Modifiy `prompt_preambles/user_preamble.txt` with your own information. If you choose, this can even be empty. +- Optionally, copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. +- In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.) +- Modify the personal prompt in `prompt_preambles/user_preamble.txt` to include some basic information about yourself that Panza can use to customize your emails with your correct full name, address, phone number, etc. + + +Additionally, please perform the following login steps to be able to download the base model. 
- Login to Hugging Face to be able to download pretrained models: `huggingface-cli login`. - - [Optional] Login to Weights & Biases to log metrics during training: `wandb login`. Then, set `PANZA_WANDB_DISABLED=False` in `scripts/config.sh`. + - [Optional] Login to Weights & Biases to log metrics during training: `wandb login`. Then, set `wandb_disabled=false` in `configs/finetuning/base.yaml`. + You are now ready to move to `scripts`. ``` bash @@ -128,54 +138,40 @@ cd scripts ### Step 2: Extract emails -1. Run `./extract_emails.sh`. This extracts your emails in text format to `data/_clean.jsonl` which you can manually inspect. - -2. If you wish to eliminate any emails from the training set (e.g. containing certain personal information), you can simply remove the corresponding rows. - -### Step 3: Prepare dataset - - -1. Simply run `./prepare_dataset.sh`.
+1. Run `CUDA_VISIBLE_DEVICES=X python ./prepare_data.py`.
This scripts takes care of all the prerequisites before training (expand for details). + - Extracts your emails in text format to `data/_clean.jsonl` which you can manually inspect. - Creates synthetic prompts for your emails as described in the [data playback](#film_projector-step-1-data-playback) section. The results are stored in `data/_clean_summarized.jsonl` and you can inspect the `"summary"` field. - Splits data into training and test subsets. See `data/train.jsonl` and `data/test.jsonl`. - Creates a vector database from the embeddings of the training emails which will later be used for *Retrieval-Augmented Generation (RAG)*. See `data/.pkl` and `data/.faiss`.
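To sanity-check these artifacts, they can be loaded back with the same libraries the preparation code itself uses. The snippet below is an illustrative sketch rather than one of the Panza scripts: it assumes the default `data/` directory, `<username>` is a placeholder for the `user.username` value used during preparation, and the embedding model matches the `rag_embedding_model` default above.

``` python
# Illustrative sketch only -- not one of the Panza scripts.
import json

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

data_dir = "data"            # default data directory
username = "<username>"      # placeholder: the user.username used during preparation

# Peek at the training split.
with open(f"{data_dir}/train.jsonl") as f:
    emails = [json.loads(line) for line in f]
print(f"{len(emails)} training emails; fields: {sorted(emails[0].keys())}")

# Load the FAISS index created for RAG and run a test query.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
db = FAISS.load_local(
    folder_path=data_dir,
    embeddings=embeddings,
    index_name=username,
    allow_dangerous_deserialization=True,  # same flag the loader in rag.py passes
)
for doc, score in db.similarity_search_with_score("scheduling a meeting", k=3):
    print(f"{score:.3f}", doc.page_content[:80])
```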
-### Step 4: Train a LLM on your emails - +ODO Jen: This doesn't work anymore, because we make the RAG database right away. If you wish to eliminate any emails from the training set (e.g. containing certain personal information), you can simply remove the corresponding rows. + +### Step 3: Train a LLM on your emails + We currently support `LLaMA3-8B-Instruct` and `Mistral-Instruct-v0.2` LLMs as base models; the former is the default, but we obtained good results with either model. 1. [Recommended] For parameter efficient fine-tuning, run `./train_rosa.sh`. If a larger GPU is available and full-parameter fine-tuning is possible, run `./train_fft.sh`. -2. We have prepopulated the training scripts with parameter values that worked best for us. We recommend you try those first, but you can also experiment with different hyper-parameters by passing extra arguments to the training script, such as `LR`, `LORA_LR`, `NUM_EPOCHS`. All the trained models are saved in the `checkpoints` directory. +2. We have prepopulated the training configs with parameter values that worked best for us. We recommend you try those first, but you can also experiment with different hyper-parameters by passing extra arguments to the training script, such as `lr`, `lora_lr`, `num_epochs`. All the trained models are saved in the `checkpoints` directory. Examples: ``` bash ./train_rosa.sh # Will use the default parameters. -./train_rosa.sh LR=1e-6 LORA_LR=1e-6 NUM_EPOCHS=7 # Will override LR, LORA_LR, and NUM_EPOCHS. +./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep. ``` ### Step 5: Launch Panza! -1. Run `./run_panza_gui.sh MODEL=` to serve the trained model in a friendly GUI. -Alternatively, if you prefer using the CLI to interact with Panza, run `./run_panza_cli.sh` instead. - -You can experiment with the following arguments: -- If `MODEL` is not specified, it will use a pretrained `Meta-Llama-3-8B-Instruct` model by default, although Panza also works with `Mistral-7B-Instruct-v2`. Try it out to compare the syle difference! -- To disable RAG, run with `PANZA_DISABLE_RAG_INFERENCE=1`. +- To run Panza after a full training run, try something like `CUDA_VISIBLE_DEVICES=0 python3 runner.py user=USERNAME interfaces=cli writer/llm=transformers`. +- To run Panza after a RoSA or LoRA training run, replace `writer/llm=transformers` with `writer/llm=peft` TODO Armand: can we fix this? -Example: -``` bash -./run_panza_gui.sh \ - MODEL=/local/path/to/this/repo/checkpoints/models/panza-rosa_1e-6-seed42_7908 \ - PANZA_DISABLE_RAG_INFERENCE=0 # this is the default behaviour, so you can omit it -``` :email: **Have fun with your new email writing assistant!** :email: From ea5ffeae2482a22603d44cc0f222ca87bc193011 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Mon, 21 Oct 2024 13:02:40 +0200 Subject: [PATCH 089/112] update env preparation script --- prepare_env.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/prepare_env.sh b/prepare_env.sh index c730020..61e106b 100644 --- a/prepare_env.sh +++ b/prepare_env.sh @@ -1,8 +1,8 @@ # crash in case of error trap 'trap - ERR RETURN; kill -INT $$ ; return' ERR RETURN -conda create --name panza python=3.10 -y -conda activate panza +conda create --name panza_refactor python=3.10 -y +conda activate panza_refactor # install dependencies based on pyproject.toml -pip install -e . 
\ No newline at end of file +pip install -e .[training] \ No newline at end of file From ca4b690d8fd88c7f3a1e4307061e581eb3dcafd8 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Mon, 28 Oct 2024 12:48:36 +0100 Subject: [PATCH 090/112] slight refactor of runner.py --- scripts/runner.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/scripts/runner.py b/scripts/runner.py index 0d5137f..303dd6a 100644 --- a/scripts/runner.py +++ b/scripts/runner.py @@ -23,13 +23,16 @@ def rename_config_keys(cfg: DictConfig) -> None: # Re-enable struct mode to lock down the configuration OmegaConf.set_struct(cfg, True) - -def get_latest_model(cfg: DictConfig) -> None: + + +def set_latest_model(cfg: DictConfig) -> None: model_files = glob.glob(f'{cfg.checkpoint_dir}/models/*') # * means all if need specific format then *.csv latest_file = max(model_files, key=os.path.getctime) - return latest_file - + OmegaConf.set_struct(cfg, False) + cfg.checkpoint = latest_file + OmegaConf.set_struct(cfg, True) + @hydra.main(version_base="1.1", config_path="../configs", config_name="panza_writer") def main(cfg: DictConfig) -> None: @@ -38,10 +41,9 @@ def main(cfg: DictConfig) -> None: # Rename config keys to follow class structure - # Find the latest checkpoint, if requested. rename_config_keys(cfg) - if cfg.checkpoint == "latest": - cfg.checkpoint = get_latest_model(cfg) + # Find the latest checkpoint, if requested. + set_latest_model(cfg) # Instantiate Panza writer writer: PanzaWriter = hydra.utils.instantiate(cfg.writer) From 233083ecd97742bc7fcade7ebc173db9aea82cb3 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Mon, 28 Oct 2024 13:09:19 +0100 Subject: [PATCH 091/112] remove some unused .sh files --- scripts/config.sh | 59 --------------------------------------- scripts/extract_emails.sh | 11 -------- 2 files changed, 70 deletions(-) delete mode 100755 scripts/config.sh delete mode 100755 scripts/extract_emails.sh diff --git a/scripts/config.sh b/scripts/config.sh deleted file mode 100755 index 08b1479..0000000 --- a/scripts/config.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -export PANZA_EMAIL_ADDRESS="firstname.lastname@gmail.com" # Change this to your email address! -export PANZA_USERNAME="${PANZA_EMAIL_ADDRESS%@*}" # Removes everything after @; for the example above, it will be firstname.lastname - -export PANZA_WORKSPACE=$(dirname "$(dirname "$(realpath "$0")")"); -export PANZA_DATA_DIR="$PANZA_WORKSPACE/data" # where data is stored -export PANZA_CHECKPOINTS="$PANZA_WORKSPACE/checkpoints" # where checkpoints are stored -export PANZA_FINETUNE_CONFIGS="$PANZA_WORKSPACE/src/panza/finetuning/configs" # where training configuration details are stored - -export PANZA_PREAMBLES="$PANZA_WORKSPACE/prompt_preambles" # this is where the system prompt and user prompt preambles can be accessed; you will need to edit these -export PANZA_SYSTEM_PREAMBLE_PATH="$PANZA_PREAMBLES/system_preamble.txt" # system prompt -# IMPORTANT: Please edit the user preamble (at the PANZA_USER_PREAMBLE_PATH) if you plan to use it (recommended). -export PANZA_USER_PREAMBLE_PATH="$PANZA_PREAMBLES/user_preamble.txt" # a useful preamble to the user instruction, explaining what's going on to the LLM -export PANZA_RAG_PREAMBLE_PATH="$PANZA_PREAMBLES/rag_preamble.txt" # a preamble for the RAG component -export PANZA_THREAD_PREAMBLE_PATH="$PANZA_PREAMBLES/thread_preamble.txt" # a preamble for the RAG component - -export PANZA_SUMMARIZATION_BATCH_SIZE=8 # batch size for summarization. 
-export PANZA_EVALUATION_BATCH_SIZE=1 # batch size for evaluation. Can safely be set to higher value (e.g., 8) if the GPU has enough capacity. - -export MODEL_PRECISION=bf16 # precision at which the base model is stored; options: bf16, fp32, or '4bit' -# export PANZA_GENERATIVE_MODEL="mistralai/Mistral-7B-Instruct-v0.2" -export PANZA_GENERATIVE_MODEL="ISTA-DASLab/Meta-Llama-3-8B-Instruct" -# export PANZA_GENERATIVE_MODEL="microsoft/Phi-3-mini-4k-instruct" - -lowercased=$(echo "$PANZA_GENERATIVE_MODEL" | tr '[:upper:]' '[:lower:]') -if [[ ${lowercased} == *llama* ]]; then - export MODEL_TYPE=llama3 -elif [[ ${lowercased} == *mistral* ]]; then - export MODEL_TYPE=mistralv2 -elif [[ ${lowercased} == *phi* ]]; then - export MODEL_TYPE=phi3 -else - echo "Model type ${PANZA_GENERATIVE_MODEL} not recognized! Panza only works with Mistral and Llama3 models. Exiting." - exit -fi - -export PANZA_EMBEDDING_MODEL="sentence-transformers/all-mpnet-base-v2" # embedding model for RAG; can be changed, trading off speed for quality - -export PANZA_RAG_RELEVANCE_THRESHOLD=0.2 # emails whose relevance is above this threshold will be presented for RAG - -export PANZA_SEED=42 # the one true seed - -export PANZA_FINETUNE_WITH_PREAMBLE=1 # states whether user and system preambles are used for fine-tuning; on by default -export PANZA_FINETUNE_WITH_RAG=0 # states whether RAG preambles are used for fine-tuning; off by default -export PANZA_FINETUNE_WITH_THREAD=0 # states whether the email thread is used for fine-tuning; off by default -export PANZA_FINETUNE_RAG_NUM_EMAILS=3 # maximum number of emails to use for RAG fine-tuning; 3 by default -export PANZA_FINETUNE_RAG_PROB=0.55 # probability of using RAG context for fine-tuning; 0.5 by default -export PANZA_FINETUNE_RAG_RELEVANCE_THRESHOLD=0.2 # emails whose relevance is above this threshold will be presented for RAG during fine-tuning -export PANZA_FINETUNE_THREAD_NUM_EMAILS=3 # maximum number of emails to use for thread fine-tuning; 3 by default -export PANZA_DISABLE_RAG_INFERENCE=0 # RAG inference is on by default, since it's usually better - -export PANZA_WANDB_DISABLED=True # disable Weights and Biases logging by default - -export PYTHONPATH="$PANZA_WORKSPACE/src:$PYTHONPATH" - -# Optionally, set your HF_HOME and/or TRANSFORMERS_CACHE here. -# export HF_HOME= -# export TRANSFORMERS_CACHE= diff --git a/scripts/extract_emails.sh b/scripts/extract_emails.sh deleted file mode 100755 index 23f5300..0000000 --- a/scripts/extract_emails.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -source config.sh - -MBOX_NAME="Sent.mbox" -MBOX_PATH="${PANZA_DATA_DIR}/${MBOX_NAME}" - -python ../src/panza/data_preparation/extract_emails.py \ - --mbox-path=${MBOX_PATH} \ - --output-path=${PANZA_DATA_DIR} \ - --email=${PANZA_EMAIL_ADDRESS} \ \ No newline at end of file From 6f943792784bcc033fc1a4c062505325d2d8d23f Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Mon, 28 Oct 2024 14:11:18 +0100 Subject: [PATCH 092/112] qq [Ad[DxxRevert "remove some unused .sh files" This reverts commit 233083ecd97742bc7fcade7ebc173db9aea82cb3. 
--- scripts/config.sh | 59 +++++++++++++++++++++++++++++++++++++++ scripts/extract_emails.sh | 11 ++++++++ 2 files changed, 70 insertions(+) create mode 100755 scripts/config.sh create mode 100755 scripts/extract_emails.sh diff --git a/scripts/config.sh b/scripts/config.sh new file mode 100755 index 0000000..08b1479 --- /dev/null +++ b/scripts/config.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +export PANZA_EMAIL_ADDRESS="firstname.lastname@gmail.com" # Change this to your email address! +export PANZA_USERNAME="${PANZA_EMAIL_ADDRESS%@*}" # Removes everything after @; for the example above, it will be firstname.lastname + +export PANZA_WORKSPACE=$(dirname "$(dirname "$(realpath "$0")")"); +export PANZA_DATA_DIR="$PANZA_WORKSPACE/data" # where data is stored +export PANZA_CHECKPOINTS="$PANZA_WORKSPACE/checkpoints" # where checkpoints are stored +export PANZA_FINETUNE_CONFIGS="$PANZA_WORKSPACE/src/panza/finetuning/configs" # where training configuration details are stored + +export PANZA_PREAMBLES="$PANZA_WORKSPACE/prompt_preambles" # this is where the system prompt and user prompt preambles can be accessed; you will need to edit these +export PANZA_SYSTEM_PREAMBLE_PATH="$PANZA_PREAMBLES/system_preamble.txt" # system prompt +# IMPORTANT: Please edit the user preamble (at the PANZA_USER_PREAMBLE_PATH) if you plan to use it (recommended). +export PANZA_USER_PREAMBLE_PATH="$PANZA_PREAMBLES/user_preamble.txt" # a useful preamble to the user instruction, explaining what's going on to the LLM +export PANZA_RAG_PREAMBLE_PATH="$PANZA_PREAMBLES/rag_preamble.txt" # a preamble for the RAG component +export PANZA_THREAD_PREAMBLE_PATH="$PANZA_PREAMBLES/thread_preamble.txt" # a preamble for the RAG component + +export PANZA_SUMMARIZATION_BATCH_SIZE=8 # batch size for summarization. +export PANZA_EVALUATION_BATCH_SIZE=1 # batch size for evaluation. Can safely be set to higher value (e.g., 8) if the GPU has enough capacity. + +export MODEL_PRECISION=bf16 # precision at which the base model is stored; options: bf16, fp32, or '4bit' +# export PANZA_GENERATIVE_MODEL="mistralai/Mistral-7B-Instruct-v0.2" +export PANZA_GENERATIVE_MODEL="ISTA-DASLab/Meta-Llama-3-8B-Instruct" +# export PANZA_GENERATIVE_MODEL="microsoft/Phi-3-mini-4k-instruct" + +lowercased=$(echo "$PANZA_GENERATIVE_MODEL" | tr '[:upper:]' '[:lower:]') +if [[ ${lowercased} == *llama* ]]; then + export MODEL_TYPE=llama3 +elif [[ ${lowercased} == *mistral* ]]; then + export MODEL_TYPE=mistralv2 +elif [[ ${lowercased} == *phi* ]]; then + export MODEL_TYPE=phi3 +else + echo "Model type ${PANZA_GENERATIVE_MODEL} not recognized! Panza only works with Mistral and Llama3 models. Exiting." 
+ exit +fi + +export PANZA_EMBEDDING_MODEL="sentence-transformers/all-mpnet-base-v2" # embedding model for RAG; can be changed, trading off speed for quality + +export PANZA_RAG_RELEVANCE_THRESHOLD=0.2 # emails whose relevance is above this threshold will be presented for RAG + +export PANZA_SEED=42 # the one true seed + +export PANZA_FINETUNE_WITH_PREAMBLE=1 # states whether user and system preambles are used for fine-tuning; on by default +export PANZA_FINETUNE_WITH_RAG=0 # states whether RAG preambles are used for fine-tuning; off by default +export PANZA_FINETUNE_WITH_THREAD=0 # states whether the email thread is used for fine-tuning; off by default +export PANZA_FINETUNE_RAG_NUM_EMAILS=3 # maximum number of emails to use for RAG fine-tuning; 3 by default +export PANZA_FINETUNE_RAG_PROB=0.55 # probability of using RAG context for fine-tuning; 0.5 by default +export PANZA_FINETUNE_RAG_RELEVANCE_THRESHOLD=0.2 # emails whose relevance is above this threshold will be presented for RAG during fine-tuning +export PANZA_FINETUNE_THREAD_NUM_EMAILS=3 # maximum number of emails to use for thread fine-tuning; 3 by default +export PANZA_DISABLE_RAG_INFERENCE=0 # RAG inference is on by default, since it's usually better + +export PANZA_WANDB_DISABLED=True # disable Weights and Biases logging by default + +export PYTHONPATH="$PANZA_WORKSPACE/src:$PYTHONPATH" + +# Optionally, set your HF_HOME and/or TRANSFORMERS_CACHE here. +# export HF_HOME= +# export TRANSFORMERS_CACHE= diff --git a/scripts/extract_emails.sh b/scripts/extract_emails.sh new file mode 100755 index 0000000..23f5300 --- /dev/null +++ b/scripts/extract_emails.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +source config.sh + +MBOX_NAME="Sent.mbox" +MBOX_PATH="${PANZA_DATA_DIR}/${MBOX_NAME}" + +python ../src/panza/data_preparation/extract_emails.py \ + --mbox-path=${MBOX_PATH} \ + --output-path=${PANZA_DATA_DIR} \ + --email=${PANZA_EMAIL_ADDRESS} \ \ No newline at end of file From 8ce3c4a38313177c6a165ed0e46f3541a65f2e98 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Tue, 29 Oct 2024 14:43:04 +0100 Subject: [PATCH 093/112] make the first part of the data preparation script optional (in case of manual modifications --- scripts/prepare_data.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py index d650cff..a2fa9e1 100644 --- a/scripts/prepare_data.py +++ b/scripts/prepare_data.py @@ -80,12 +80,13 @@ def generate_synthetic_instructions( def check_if_file_exists(cfg: DictConfig) -> None: - if os.path.exists(cfg.cleaned_emails_path) and not cfg.force: + if os.path.exists(cfg.cleaned_emails_path) and not cfg.force_extract_clean_emails: LOGGER.warning( - "Summaries already exists, program will close. " - "If you want to regenerate use the flag force=true." + f"Cleaned email file already exists, using existing file {cfg.cleaned_emails_path}. " + "If you want to regenerate use the flag force_extract_clean_emails=true." 
) - sys.exit(0) + return True + return False def split_and_write_data(cfg): @@ -121,16 +122,13 @@ def main(cfg: DictConfig) -> None: LOGGER.info("Running Panza Data Preparation") LOGGER.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) - - # Skip running if summaries already exist - check_if_file_exists(cfg) - # Rename config keys to follow class structure rename_config_keys(cfg) - # Extract the emails from the .mbox file - extract_emails(cfg.email_dump_path, cfg.cleaned_emails_path, [cfg.user.email_address], cfg.discarded_emails_dir) - # Filter emails? + # Skip running if already exist + if not check_if_file_exists(cfg): + # Extract the emails from the .mbox file + extract_emails(cfg.email_dump_path, cfg.cleaned_emails_path, [cfg.user.email_address], cfg.discarded_emails_dir) # Instantiate Panza writer writer: PanzaWriter = hydra.utils.instantiate(cfg.writer) From c1f36a8bd0f2170e09653a026e527738b23cb7af Mon Sep 17 00:00:00 2001 From: Andrej Jovanovic Date: Thu, 31 Oct 2024 13:01:47 +0100 Subject: [PATCH 094/112] Edits and Bug Fixes - Added bug fix for error encountered in json dumps for Message and mboxMessage objects - Added clarification for email and username reqs - Changed wanbd_disabled default to true to track with README --- README_panza3.md | 8 ++--- configs/finetuning/base.yaml | 2 +- configs/user/default.yaml | 4 +-- src/panza3/data_preparation/extract_emails.py | 36 +++++++++++++------ 4 files changed, 33 insertions(+), 17 deletions(-) diff --git a/README_panza3.md b/README_panza3.md index 5eab314..1027732 100644 --- a/README_panza3.md +++ b/README_panza3.md @@ -109,19 +109,19 @@ At the end of this step you should have the downloaded emails placed inside `dat ### Step 1: Environment configuration -Panza is configured through a set of yaml configurations defined in `configs/`. There is a single high-level config under `configs/base.yaml`, and the rest are organized unders the main functionalities of the code. -Note that these task-specific configs can, in some cases, be used to overrride base configs. +Panza is configured through a set of yaml configurations defined in `configs/`. There is a single high-level config under `configs/base.yaml`, and the rest are organized under the main functionalities of the code. +Note that these task-specific configs can, in some cases, be used to override base configs. Specific use cases, such as hyperparameter tuning, are covered in more detail in `scripts/README.md`. (TODO jen: write this up.) 1. Data preparation: `configs/data_preparation.yaml`. Additionally, a custom user config must be added under `config/users/` (see below). 1. Finetuning: the main config is in `configs/panza_finetuning.yaml` and the method-specific ones are in `configs/finetuning/` -1. Serving: Serving consists of two parts - a serving infrastructre (that we call 'writer') that runs the LLM and so converts prompts to Panza outputs, and an `interface`, which presents the outputs in a useful form - through a command-line interface, a web interface, a gmail client (TODO:Sean), or in a bulk `.json` format (useful for evaluation). The configs for serving are in `panza_writer.yaml`, and for the interfaces, under `configs/interfaces`. +1. 
Serving: Serving consists of two parts - a serving infrastructure (that we call 'writer') that runs the LLM and so converts prompts to Panza outputs, and an `interface`, which presents the outputs in a useful form - through a command-line interface, a web interface, a gmail client (TODO:Sean), or in a bulk `.json` format (useful for evaluation). The configs for serving are in `panza_writer.yaml`, and for the interfaces, under `configs/interfaces`. These scripts are described in more detail in `scripts/README.md`, but a few customizations need to happen immediately. :warning: Before continuing, make sure you complete the following setup: - Optionally, copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. -- In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.) +- In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.). The username does not have to link to the email itself - it is simply used as a name for the various data files that will come out of the data preparation process. - Modify the personal prompt in `prompt_preambles/user_preamble.txt` to include some basic information about yourself that Panza can use to customize your emails with your correct full name, address, phone number, etc. diff --git a/configs/finetuning/base.yaml b/configs/finetuning/base.yaml index ee5b3d8..cf594fd 100644 --- a/configs/finetuning/base.yaml +++ b/configs/finetuning/base.yaml @@ -1,4 +1,4 @@ -wandb_disabled: false +wandb_disabled: true # We assume that wandb is disabled unless the user has logged on. max_seq_len: 512 global_seed: ${seed} diff --git a/configs/user/default.yaml b/configs/user/default.yaml index 853692c..4b36513 100644 --- a/configs/user/default.yaml +++ b/configs/user/default.yaml @@ -1,5 +1,5 @@ -email_address: "firstname.lastname@gmail.com" # Change this to your email address! -username: "firstname.lastname" # TODO(armand): Use custom resolver to extract username from email address. +email_address: "abc@xyz.com" # Change this to your email address! +username: "abc" # TODO(armand): Use custom resolver to extract username from email address. 
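Regarding the TODO on the `username` line above: a custom OmegaConf resolver along the following lines could derive the username from the email address, so it would not need to be configured twice. This is a hypothetical sketch, not part of the current codebase; the resolver name and where it is registered are assumptions.

``` python
# Hypothetical sketch for the TODO above -- not part of the current codebase.
from omegaconf import OmegaConf

OmegaConf.register_new_resolver(
    "username_from_email", lambda address: address.split("@", 1)[0]
)

# The user config could then reference it as:
#   username: ${username_from_email:${user.email_address}}
```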
data_dir: ${panza_workspace}/data diff --git a/src/panza3/data_preparation/extract_emails.py b/src/panza3/data_preparation/extract_emails.py index ef34543..4739d09 100644 --- a/src/panza3/data_preparation/extract_emails.py +++ b/src/panza3/data_preparation/extract_emails.py @@ -3,6 +3,8 @@ import mailbox import re from email.utils import parsedate_to_datetime +from email.message import Message +from mailbox import mboxMessage from os import makedirs from os.path import join, dirname @@ -42,7 +44,7 @@ def remove_date_time(email_body): match = pattern.search(email_body) if match: - return (email_body[:match.start()] + email_body[match.end():]).strip() + return (email_body[: match.start()] + email_body[match.end() :]).strip() else: return email_body @@ -61,17 +63,17 @@ def count_words(s): def extract_by_quote_level(text): # Split the text into lines - lines = text.split('\n') + lines = text.split("\n") # Dictionary to store lines by quote level grouped_lines = {} for line in lines: # Count the number of '>' at the start of the line - quote_level = len(re.match(r'^>*', line).group()) + quote_level = len(re.match(r"^>*", line).group()) # Remove leading '>' and spaces - clean_line = re.sub(r'^>*\s*', '', line) + clean_line = re.sub(r"^>*\s*", "", line) # Add the clean line to the appropriate group if quote_level not in grouped_lines: @@ -99,7 +101,7 @@ def filter_message(msg): email_with_thread = [remove_date_time(an_email) for an_email in email_with_thread] main_email = email_with_thread.pop(0) - email_with_thread.reverse() # chronological order + email_with_thread.reverse() # chronological order # check length before detecting language if count_words(main_email) < SHORT_EMAIL_THRESHOLD: @@ -139,13 +141,27 @@ def extract_emails(mailbox_path, output_path, email_addresses, save_discarded_em if filtered_msg is not None: print(filtered_msg) main_email, thread = filtered_msg - CLEAN_EMAILS.append({"email": main_email, "thread": thread, "subject": message["Subject"], "date": date}) + CLEAN_EMAILS.append( + { + "email": main_email, + "thread": thread, + "subject": message["Subject"], + "date": date, + } + ) else: filtered_msg = filter_message(message) if filtered_msg is not None: print(filtered_msg) main_email, thread = filtered_msg - CLEAN_EMAILS.append({"email": main_email, "thread": thread, "subject": message["Subject"], "date": date}) + CLEAN_EMAILS.append( + { + "email": main_email, + "thread": thread, + "subject": message["Subject"], + "date": date, + } + ) print(f"\n---> [Cleaning stats] <---") print(f"# clean emails = {len(CLEAN_EMAILS)}") @@ -173,12 +189,12 @@ def extract_emails(mailbox_path, output_path, email_addresses, save_discarded_em if save_discarded_emails_path and save_discarded_emails_path != "": makedirs(save_discarded_emails_path, exist_ok=True) for k, v in DISCARDED_EMAILS.items(): - output_path = join( - save_discarded_emails_path, f"{username}_discarded_{k}.jsonl" - ) + output_path = join(save_discarded_emails_path, f"{username}_discarded_{k}.jsonl") with open(output_path, "w", encoding="utf-8") as f: for i, item in enumerate(v): print("\n\n\n\n\===========================") + if type(item) is Message or type(item) is mboxMessage: + item = item.get_payload() print(item) print("this is number", i, output_path) json_record = json.dumps(item) From bacbdf7fc47244bb353b39db914b927c98ffa3d1 Mon Sep 17 00:00:00 2001 From: Andrej Jovanovic Date: Mon, 4 Nov 2024 10:40:28 +0100 Subject: [PATCH 095/112] Add additional clarification on username importance --- README_panza3.md | 15 
+++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/README_panza3.md b/README_panza3.md index 1027732..787a777 100644 --- a/README_panza3.md +++ b/README_panza3.md @@ -120,8 +120,8 @@ Note that these task-specific configs can, in some cases, be used to override ba These scripts are described in more detail in `scripts/README.md`, but a few customizations need to happen immediately. :warning: Before continuing, make sure you complete the following setup: -- Optionally, copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. -- In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.). The username does not have to link to the email itself - it is simply used as a name for the various data files that will come out of the data preparation process. +- Optionally, copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. A useful tip for choosing the name of `[YOURNAME]` is to set it to the output of `whoami`. +- In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.). The username does not have to link to the email itself - it is simply used as a name for the various data files that will come out of the data preparation process. A handy way to set this is if you set it to be the output of the `whoami` call in your shell. - Modify the personal prompt in `prompt_preambles/user_preamble.txt` to include some basic information about yourself that Panza can use to customize your emails with your correct full name, address, phone number, etc. @@ -165,6 +165,17 @@ Examples: ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep. ``` +
+<details>
+  <summary> FAQs. </summary>
+  The bash scripts used to run the finetuning procedure assume by default that your username is the output of the `whoami` command. This is used to locate your user config inside the `configs/user` directory, as described above. If you modified `default.yaml` directly, or created another yaml file whose name does not match the output of `whoami`, you will get an error. This is easy to fix; you can either:
+  <br>
+  1. Rename the yaml file to match the output of `whoami`, or
+  <br>
+  2. Override the username manually when launching the bash script by adding `user=x`, where `x` is the name of the yaml file you created. For example: `./train_rosa.sh user=alonso`
+  <br><br>
+  If you wish to set `CUDA_VISIBLE_DEVICES` to run on a specific GPU, add it directly in the shell script via `export CUDA_VISIBLE_DEVICES=x`, where `x` is the ID of the GPU you wish to use.
+</details>
+ ### Step 5: Launch Panza! From 6be5813ba3c1d13fbf6cdac7fc7f936c9e0210f6 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Mon, 4 Nov 2024 14:29:32 +0100 Subject: [PATCH 096/112] update data preparation --- configs/base.yaml | 4 ++-- configs/finetuning/rosa.yaml | 2 +- configs/panza_preparation.yaml | 2 +- configs/user/default.yaml | 2 +- configs/writer/llm/peft.yaml | 2 +- configs/writer/llm/transformers.yaml | 2 +- configs/writer/prompting/email_prompting.yaml | 4 ++-- prompt_preambles/user_preamble.txt | 7 ++----- scripts/run_services.sh | 7 ++++--- src/panza/evaluation/base_inference.py | 2 ++ src/panza/evaluation/service_inference.py | 10 ++++++---- 11 files changed, 23 insertions(+), 21 deletions(-) diff --git a/configs/base.yaml b/configs/base.yaml index 0c5d85e..0a09b04 100644 --- a/configs/base.yaml +++ b/configs/base.yaml @@ -3,7 +3,7 @@ defaults: panza_workspace: ${hydra:runtime.cwd}/../ checkpoint_dir: ${panza_workspace}/checkpoints -seed: 42 +seed: 41 embedding_model: "sentence-transformers/all-mpnet-base-v2" -model_precision: bf16 # bf16 or fp32 \ No newline at end of file +model_precision: bf16 # bf16 or fp32 diff --git a/configs/finetuning/rosa.yaml b/configs/finetuning/rosa.yaml index 6abaa4f..d896445 100644 --- a/configs/finetuning/rosa.yaml +++ b/configs/finetuning/rosa.yaml @@ -29,4 +29,4 @@ rosa: scheduler: t_warmup: 8ba -num_cpu_threads: 0 \ No newline at end of file +num_cpu_threads: 0 diff --git a/configs/panza_preparation.yaml b/configs/panza_preparation.yaml index 449df89..72134fa 100644 --- a/configs/panza_preparation.yaml +++ b/configs/panza_preparation.yaml @@ -13,7 +13,7 @@ summarized_emails_path: ${user.data_dir}/${user.username}_emails_clean_summarize rag_db_dir: ${user.data_dir} checkpoint: "microsoft/Phi-3-mini-4k-instruct" -force: false # If false, data will not be recreated if it already exists. +force_extract_clean_emails: false # If false, data will not be recreated if it already exists. # Parameters for train-test split, if required. test_split: 0. 
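For reference, the `test_split` and `split_type` options shown above control an optional train/test split of the summarized emails. The following is a minimal sketch of the intended behaviour under that reading; it is a hypothetical helper, not the project's own function, written against the summarized JSONL records whose `date` field is produced by the extraction step.

``` python
# Hypothetical helper illustrating the test_split / split_type semantics above.
import json
import random
from datetime import datetime
from typing import List, Tuple


def split_lines(
    lines: List[str], test_split: float, split_type: str, seed: int
) -> Tuple[List[str], List[str]]:
    if split_type == "random":
        random.Random(seed).shuffle(lines)
    elif split_type == "chronological":
        lines = sorted(
            lines, key=lambda line: datetime.fromisoformat(json.loads(line)["date"])
        )
    else:
        raise ValueError("Invalid split type.")
    train_size = int(len(lines) * (1 - test_split))  # fraction of the data kept for training
    return lines[:train_size], lines[train_size:]
```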
diff --git a/configs/user/default.yaml b/configs/user/default.yaml index 4b36513..0581579 100644 --- a/configs/user/default.yaml +++ b/configs/user/default.yaml @@ -6,4 +6,4 @@ data_dir: ${panza_workspace}/data system_preamble_path: ${panza_workspace}/prompt_preambles/system_preamble.txt user_preamble_path: ${panza_workspace}/prompt_preambles/user_preamble.txt rag_preamble_path: ${panza_workspace}/prompt_preambles/rag_preamble.txt -thread_preamble_path: ${panza_workspace}/prompt_preambles/thread_preamble.txt \ No newline at end of file +thread_preamble_path: ${panza_workspace}/prompt_preambles/thread_preamble.txt diff --git a/configs/writer/llm/peft.yaml b/configs/writer/llm/peft.yaml index 8a97b5e..fcae93a 100644 --- a/configs/writer/llm/peft.yaml +++ b/configs/writer/llm/peft.yaml @@ -4,6 +4,6 @@ defaults: _target_: panza3.llm.PeftLLM name: ${checkpoint} checkpoint: ${checkpoint} -device: "cpu" # Alternatively, "cuda" +device: "cuda" # Alternatively, "cuda" dtype: "fp32" load_in_4bit: false diff --git a/configs/writer/llm/transformers.yaml b/configs/writer/llm/transformers.yaml index f872cf0..ddfb3c6 100644 --- a/configs/writer/llm/transformers.yaml +++ b/configs/writer/llm/transformers.yaml @@ -4,6 +4,6 @@ defaults: _target_: panza3.llm.TransformersLLM name: ${checkpoint} checkpoint: ${checkpoint} -device: "cpu" +device: "cuda" dtype: "fp32" load_in_4bit: false \ No newline at end of file diff --git a/configs/writer/prompting/email_prompting.yaml b/configs/writer/prompting/email_prompting.yaml index 6dc5631..235fc3e 100644 --- a/configs/writer/prompting/email_prompting.yaml +++ b/configs/writer/prompting/email_prompting.yaml @@ -8,6 +8,6 @@ user_preamble: ${load_user_preamble:${user.user_preamble_path}} rag_preamble: ${load_preamble:${user.rag_preamble_path}} thread_preamble: ${load_preamble:${user.thread_preamble_path}} -number_rag_emails: 3 +number_rag_emails: 0 rag_relevance_threshold: 0.2 -number_thread_emails: 3 \ No newline at end of file +number_thread_emails: 0 \ No newline at end of file diff --git a/prompt_preambles/user_preamble.txt b/prompt_preambles/user_preamble.txt index 67b0385..a68751a 100644 --- a/prompt_preambles/user_preamble.txt +++ b/prompt_preambles/user_preamble.txt @@ -2,8 +2,5 @@ # better help Panza write on their behalf. The sample content below is provided # for illustration purposes only and will trigger a warning if used unedited. -[CHANGE ME] My name is Jane Doe. I work as a manager at Acme Corp. -My address is 123 Main Street, Springfield, IL, USA. -My boss's name is Alex Burns. My children's names are Elsa, Anna, and Olaf. -I am deeply committed to my hobby of underwater basket weaving, for which -we meet every Thursday at noon. +My name is Jen Iofinova. 
+ diff --git a/scripts/run_services.sh b/scripts/run_services.sh index 0f7122a..d054232 100755 --- a/scripts/run_services.sh +++ b/scripts/run_services.sh @@ -3,9 +3,11 @@ source config.sh MODEL="../checkpoints/models/panza_seanyang711_llama3_bf16-bs8-rosa_wl16_d0.01_1grads_mean_squared_r8_loralr1e-5_alpha16-lr1e-5-epochs5-wu8-seed42-PREAMBLE-16296" +MODEL="../checkpoints/models/panza_llama3_bf16-bs8-rosa_wl16_d0_1grads_mean_squared_r8_loralr1e-5_alpha16-lr1e-5-epochs5-wu8-seed42-PREAMBLE-31921" +MODEL="../checkpoints/models/panza_jen.iofinova-Meta-Llama-3-8B-Instruct-bf16-bs8-fft-lr1e-05-3ep-seed41" DEVICE="cuda:1" -DTYPE="bf16" +DTYPE="auto" for ARGUMENT in "$@" do @@ -32,5 +34,4 @@ python ${INFERENCE_SCRIPT} \ --db-path=${PANZA_DATA_DIR} \ --index-name=${PANZA_USERNAME} \ --rag-relevance-threshold=${PANZA_RAG_RELEVANCE_THRESHOLD} \ - ${USE_RAG} \ - ${USE_4BIT_QUANT} \ No newline at end of file + ${USE_4BIT_QUANT} diff --git a/src/panza/evaluation/base_inference.py b/src/panza/evaluation/base_inference.py index 744f635..8f421e9 100644 --- a/src/panza/evaluation/base_inference.py +++ b/src/panza/evaluation/base_inference.py @@ -8,6 +8,8 @@ from panza.utils import prompting from panza.utils.documents import Email +from transformers import AutoTokenizer, AutoModelForCausalLM +from peft import AutoPeftModelForCausalLM sys.path.pop(0) diff --git a/src/panza/evaluation/service_inference.py b/src/panza/evaluation/service_inference.py index 80ae230..9dc452d 100644 --- a/src/panza/evaluation/service_inference.py +++ b/src/panza/evaluation/service_inference.py @@ -56,9 +56,11 @@ class Response(BaseModel): embeddings_model = rag.get_embeddings_model(args.embedding_model) db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) -system_preamble, user_preamble, rag_preamble = prompting.load_all_preambles( - args.system_preamble, args.user_preamble, args.rag_preamble -) +#system_preamble, user_preamble, rag_preamble = prompting.load_all_preambles( +# args.system_preamble, args.user_preamble, args.rag_preamble +#) + +system_preamble, user_preamble, rag_preamble = ("", "", "") def predict(user_input): prompts, outputs = base_inference.run_inference( @@ -98,4 +100,4 @@ def generate_text(request: Request, x_api_key: Annotated[str | None, Header()] = if __name__ == '__main__': - uvicorn.run(app, host='0.0.0.0', port=5000) \ No newline at end of file + uvicorn.run(app, host='0.0.0.0', port=5000) From 302bae61b993b63c9c57def5f3a5fc53a48ddbb9 Mon Sep 17 00:00:00 2001 From: Andrej Jovanovic Date: Fri, 8 Nov 2024 17:57:51 +0100 Subject: [PATCH 097/112] Miscellanous updates This commit features a series of updates. 1. Introduction of formatting with Black added through a precommit that contributers should install. Instructions to do so have been added so that if PRs are created, all code is in same formatting. 2. Formatting code with Black. 3. Removal of debug print statements. 4. 
Addressing bug with n_proc > in datasets.map with HF --- .pre-commit-config.yaml | 6 + README_panza3.md | 35 +- configs/finetuning/rosa.yaml | 2 +- pyproject.toml | 3 + scripts/prepare_data.py | 25 +- scripts/prepare_data.sh | 31 + scripts/runner.py | 10 +- scripts/runner.sh | 31 + src/panza3/data_preparation/extract_emails.py | 17 +- src/panza3/data_preparation/rag.py | 13 +- src/panza3/entities/document.py | 4 +- src/panza3/finetuning/preprocessing.py | 6 +- src/panza3/finetuning/train.py | 793 ++++++++---------- src/panza3/interface/json.py | 142 ++-- 14 files changed, 586 insertions(+), 532 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100755 scripts/prepare_data.sh create mode 100755 scripts/runner.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..9366673 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: +- repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black + language_version: python3.10 \ No newline at end of file diff --git a/README_panza3.md b/README_panza3.md index 787a777..38a9dcd 100644 --- a/README_panza3.md +++ b/README_panza3.md @@ -120,7 +120,7 @@ Note that these task-specific configs can, in some cases, be used to override ba These scripts are described in more detail in `scripts/README.md`, but a few customizations need to happen immediately. :warning: Before continuing, make sure you complete the following setup: -- Optionally, copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. A useful tip for choosing the name of `[YOURNAME]` is to set it to the output of `whoami`. +- Copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. A useful tip for choosing the name of `[YOURNAME]` is to set it to the output of `whoami`. If you modify the default yaml, you will need specify `user=default` as an extra flag in the succeeding steps. - In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.). The username does not have to link to the email itself - it is simply used as a name for the various data files that will come out of the data preparation process. A handy way to set this is if you set it to be the output of the `whoami` call in your shell. - Modify the personal prompt in `prompt_preambles/user_preamble.txt` to include some basic information about yourself that Panza can use to customize your emails with your correct full name, address, phone number, etc. @@ -146,6 +146,16 @@ cd scripts - Splits data into training and test subsets. See `data/train.jsonl` and `data/test.jsonl`. - Creates a vector database from the embeddings of the training emails which will later be used for *Retrieval-Augmented Generation (RAG)*. See `data/.pkl` and `data/.faiss`.
+**NB**: if you did not change the default configuration in `user/default.yaml` to reflect your particulars but rather created a new file, you need to add the flag `user=x` to the above command, where `x.yaml` is the name of your config file (see the example below).
+
+
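For example, this is a minimal sketch of the resulting invocation, assuming the preparation step is launched through the `prepare_data.sh` convenience script added in this commit; the name `jane` is purely a placeholder for your own config file:

``` bash
# Illustrative only: 'jane' stands in for the name of your user config file (jane.yaml).
cd scripts
./prepare_data.sh user=jane
```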
+ FAQs.
+ When running the above script, you may encounter an OutOfMemoryError. If this is the case, you can either:
+
+  1. Reduce the batch size for the data processing step. This can be found in configs/panza_preparation.yaml (see the example below).
+  2. Move to a machine that has more memory.
+
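Since the preparation entry point is a Hydra script, the batch size can also be lowered from the command line without editing the YAML. This is a sketch under that assumption; `batch_size=8` is an arbitrary illustrative value:

``` bash
# Hedged example: override the data-preparation batch size defined in
# configs/panza_preparation.yaml to reduce memory pressure.
./prepare_data.sh batch_size=8
```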
TODO Jen: This doesn't work anymore, because we make the RAG database right away.
 If you wish to eliminate any emails from the training set (e.g. containing certain personal information), you can simply remove the corresponding rows.
@@ -161,9 +171,9 @@ If a larger GPU is available and full-parameter fine-tuning is possible, run `./
 Examples:
 ``` bash
-./train_rosa.sh # Will use the default parameters.
+CUDA_VISIBLE_DEVICES=X ./train_rosa.sh # Will use the default parameters.
 
-./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep.
+CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep.
 ```
FAQs. @@ -174,6 +184,8 @@ Examples:
If you wish to use CUDA_VISIBLE_DEVICES to select a specific GPU, set it directly in the shell script via `export CUDA_VISIBLE_DEVICES=x`, where `x` is the ID of the GPU you wish to use (see the example below).
+
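For instance, a minimal sketch of what that could look like near the top of `train_rosa.sh` (GPU ID 0 is just an example):

``` bash
# Example only: pin the fine-tuning run to GPU 0.
export CUDA_VISIBLE_DEVICES=0
```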

+ A known issue is that when you fine-tune your model with RAG, the tokenization of the dataset may appear to hang. This is due to a known bug in HF's `datasets.map` when `n_proc > 1`. To alleviate this issue, you can set `torch.set_num_threads(1)` in `src/panza3/finetuning/train.py` or set the equivalent parameter in `configs/finetuning/rosa.yaml`, as shown below.
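A minimal sketch of the config-based workaround, assuming `num_cpu_threads` (set in `configs/finetuning/rosa.yaml` by this commit) is the equivalent parameter and that it can be overridden on the command line like the other fine-tuning options:

``` bash
# Hedged example: force single-threaded data processing to avoid the tokenization hang.
CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.num_cpu_threads=1
```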
@@ -194,6 +206,23 @@ Examples: - [Hyper-Parameter Tuning Guide](./scripts/README.md#hyper-parameter-tuning-guide) - [Prompt Preambles Tutorial](prompt_preambles/README.md) +## :woman_technologist: Contributing +If you liked our work and want to contribute to improve the system, please feel free to do so! Make a _fork_ of our repository and once you have made your changes, submit a pull request so that we can review! + +One thing to mention: we want to make sure that we all adhere to the same coding standards, so we have added Black, a code formatter, as a prehook. To ensure that all your files are formatted with Black, do the following: + +1. Install the necessary dependencies +``` +pip install .[contributing] +``` + +2. Run the precommit command +``` +pre-commit install +``` + +3. Continue adding code as usual. All your code will be formatted by Black before commiting! + ## Authors Panza was conceived by Nir Shavit and Dan Alistarh and built by the [Distributed Algorithms and Systems group](https://ist.ac.at/en/research/alistarh-group/) at IST Austria. The contributors are (in alphabetical order): diff --git a/configs/finetuning/rosa.yaml b/configs/finetuning/rosa.yaml index d896445..3f614ec 100644 --- a/configs/finetuning/rosa.yaml +++ b/configs/finetuning/rosa.yaml @@ -29,4 +29,4 @@ rosa: scheduler: t_warmup: 8ba -num_cpu_threads: 0 +num_cpu_threads: 1 diff --git a/pyproject.toml b/pyproject.toml index fec5c2e..e7d70a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,9 @@ training = [ "peft@git+https://github.com/IST-DASLab/peft-rosa.git@grad_quant_looser_versioning", "spops-sm-80", ] +contributing = [ + "pre-commit", +] [build-system] requires = ["setuptools >= 61.0.0"] diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py index a2fa9e1..1b94e39 100644 --- a/scripts/prepare_data.py +++ b/scripts/prepare_data.py @@ -4,7 +4,6 @@ import os import random import shutil -import sys import time from typing import List @@ -105,7 +104,7 @@ def split_and_write_data(cfg): else: raise ValueError("Invalid split type.") - train_size = int(len(data) * 1-cfg.test_split) + train_size = int(len(data) * 1 - cfg.test_split) with open(os.path.join(cfg.user.data_dir, "train.jsonl"), "w") as f: for i in range(train_size): @@ -116,7 +115,6 @@ def split_and_write_data(cfg): f.write(data[i]) - @hydra.main(version_base="1.1", config_path="../configs", config_name="panza_preparation") def main(cfg: DictConfig) -> None: LOGGER.info("Running Panza Data Preparation") @@ -128,7 +126,12 @@ def main(cfg: DictConfig) -> None: # Skip running if already exist if not check_if_file_exists(cfg): # Extract the emails from the .mbox file - extract_emails(cfg.email_dump_path, cfg.cleaned_emails_path, [cfg.user.email_address], cfg.discarded_emails_dir) + extract_emails( + cfg.email_dump_path, + cfg.cleaned_emails_path, + [cfg.user.email_address], + cfg.discarded_emails_dir, + ) # Instantiate Panza writer writer: PanzaWriter = hydra.utils.instantiate(cfg.writer) @@ -142,14 +145,24 @@ def main(cfg: DictConfig) -> None: # Load documents documents = load_documents(cfg.cleaned_emails_path) generate_synthetic_instructions( - documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=cfg.summarized_emails_path + documents=documents, + writer=writer, + batch_size=cfg.batch_size, + output_path=cfg.summarized_emails_path, ) # Write the test data to test.jsonl, with an optional train-test split split_and_write_data(cfg) # Use only the training data (which might be all the data) for RAG. 
- create_vector_store(os.path.join(cfg.user.data_dir, "train.jsonl"), cfg.rag_embedding_chunk_size, cfg.rag_embedding_chunk_overlap, cfg.rag_db_dir, cfg.user.username, cfg.rag_embedding_model) + create_vector_store( + os.path.join(cfg.user.data_dir, "train.jsonl"), + cfg.rag_embedding_chunk_size, + cfg.rag_embedding_chunk_overlap, + cfg.rag_db_dir, + cfg.user.username, + cfg.rag_embedding_model, + ) if __name__ == "__main__": diff --git a/scripts/prepare_data.sh b/scripts/prepare_data.sh new file mode 100755 index 0000000..0207732 --- /dev/null +++ b/scripts/prepare_data.sh @@ -0,0 +1,31 @@ +# Convenience script for data preparation +# All arguments to the python script can be provided +# here exactly in the form they would be passed to the +# python script directly. +# +# Example usage: +# CUDA_VISIBLE_DEVICES=x ./prepare_data.sh user=alonso + +set -e + +vars=() +# Set a default for the required user argument. We'll override it +# later if provided. +vars[1]=$"user=$(whoami)" +idx=2 + +# process input arguments +for argument in "$@" +do + key=$(echo $argument | cut -f1 -d=) + + if [[ $key == user ]]; then + # We already set the default value here; change it now. + vars[1]=$argument + else + vars[idx]=$argument + idx+=1 + fi +done + +python ./prepare_data.py ${vars[@]} \ No newline at end of file diff --git a/scripts/runner.py b/scripts/runner.py index 303dd6a..e0728a3 100644 --- a/scripts/runner.py +++ b/scripts/runner.py @@ -6,7 +6,6 @@ from omegaconf import DictConfig, OmegaConf from panza3 import PanzaWriter # The import also loads custom Hydra resolvers -from panza3.entities import EmailInstruction LOGGER = logging.getLogger(__name__) @@ -26,13 +25,15 @@ def rename_config_keys(cfg: DictConfig) -> None: def set_latest_model(cfg: DictConfig) -> None: - model_files = glob.glob(f'{cfg.checkpoint_dir}/models/*') # * means all if need specific format then *.csv + model_files = glob.glob( + f"{cfg.checkpoint_dir}/models/*" + ) # * means all if need specific format then *.csv latest_file = max(model_files, key=os.path.getctime) - + OmegaConf.set_struct(cfg, False) cfg.checkpoint = latest_file OmegaConf.set_struct(cfg, True) - + @hydra.main(version_base="1.1", config_path="../configs", config_name="panza_writer") def main(cfg: DictConfig) -> None: @@ -40,7 +41,6 @@ def main(cfg: DictConfig) -> None: LOGGER.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) # Rename config keys to follow class structure - rename_config_keys(cfg) # Find the latest checkpoint, if requested. set_latest_model(cfg) diff --git a/scripts/runner.sh b/scripts/runner.sh new file mode 100755 index 0000000..d59b467 --- /dev/null +++ b/scripts/runner.sh @@ -0,0 +1,31 @@ +# Convenience script for launching your fine-tuned model. +# All arguments to the python script can be provided +# here exactly in the form they would be passed to the +# python script directly. +# +# Example usage: +# CUDA_VISIBLE_DEVICES=x ./runner.sh user=USERNAME interfaces=cli writer/llm=transformers + +set -e + +vars=() +# Set a default for the required user argument. We'll override it +# later if provided. +vars[1]=$"user=$(whoami)" +idx=2 + +# process input arguments +for argument in "$@" +do + key=$(echo $argument | cut -f1 -d=) + + if [[ $key == user ]]; then + # We already set the default value here; change it now. 
+ vars[1]=$argument + else + vars[idx]=$argument + idx+=1 + fi +done + +python3 runner.py ${vars[@]} \ No newline at end of file diff --git a/src/panza3/data_preparation/extract_emails.py b/src/panza3/data_preparation/extract_emails.py index 4739d09..e9ead5e 100644 --- a/src/panza3/data_preparation/extract_emails.py +++ b/src/panza3/data_preparation/extract_emails.py @@ -1,4 +1,3 @@ -import argparse import json import mailbox import re @@ -21,6 +20,8 @@ SHORT_EMAIL_THRESHOLD = 10 # words +FORWARDED_MESSAGE_TAG = "---------- Forwarded message ---------" + def extract_only_plain_text(msg_part): if msg_part.get_content_type() == "text/plain": @@ -30,7 +31,7 @@ def extract_only_plain_text(msg_part): def skip_forwarded_messages(plain_text): - if "---------- Forwarded message ---------" in plain_text: + if FORWARDED_MESSAGE_TAG in plain_text: DISCARDED_EMAILS["forwarded"].append(plain_text) return "" else: @@ -134,7 +135,11 @@ def extract_emails(mailbox_path, output_path, email_addresses, save_discarded_em print(f"--> processing {i}/{n_emails} <--") # Filter messages sent from your email address if message["from"] and any(email in message["from"] for email in EMAIL): - date = parsedate_to_datetime(message["Date"]).isoformat() + if message["Date"]: + date = parsedate_to_datetime(message["Date"]).isoformat() + else: + print("Date was not found in the email. Skipping.") + continue if message.is_multipart(): for part in message.walk(): filtered_msg = filter_message(part) @@ -187,15 +192,17 @@ def extract_emails(mailbox_path, output_path, email_addresses, save_discarded_em # Save discarded emails if save_discarded_emails_path and save_discarded_emails_path != "": + print(f"\n---> Processing Discarded Emails <---") makedirs(save_discarded_emails_path, exist_ok=True) for k, v in DISCARDED_EMAILS.items(): + print(f"--> processing {k} emails <--") output_path = join(save_discarded_emails_path, f"{username}_discarded_{k}.jsonl") with open(output_path, "w", encoding="utf-8") as f: + discarded_emails = len(v) for i, item in enumerate(v): print("\n\n\n\n\===========================") if type(item) is Message or type(item) is mboxMessage: item = item.get_payload() - print(item) - print("this is number", i, output_path) + print(f"--> processing {i}/{discarded_emails} <--") json_record = json.dumps(item) f.write(json_record + "\n") diff --git a/src/panza3/data_preparation/rag.py b/src/panza3/data_preparation/rag.py index 649c281..ee03367 100644 --- a/src/panza3/data_preparation/rag.py +++ b/src/panza3/data_preparation/rag.py @@ -1,7 +1,7 @@ import copy import json import time -from abc import ABC, abstractmethod +from abc import ABC from dataclasses import asdict, dataclass from datetime import datetime from typing import Dict, List, Optional, Union @@ -13,6 +13,7 @@ from langchain_core.vectorstores import VectorStore from langchain.text_splitter import RecursiveCharacterTextSplitter + @dataclass(kw_only=True) class Email(ABC): email: str @@ -65,7 +66,7 @@ def load_vector_db_from_disk( print("Faiss index loaded ") return db except Exception as e: - print("Fiass index loading failed \n", e) + print("FAISS index loading failed \n", e) def load_emails(path: str) -> List[Email]: @@ -93,9 +94,10 @@ def process_emails(emails: List[Email], chunk_size: int, chunk_overlap: int) -> return documents - -def create_vector_store(path_to_emails, chunk_size, chunk_overlap, db_path, index_name, embedding_model): - +def create_vector_store( + path_to_emails, chunk_size, chunk_overlap, db_path, index_name, embedding_model +): + 
"""Create FAISS vector database for search and retrieval.""" # Load emails emails = load_emails(path_to_emails) print(f"Loaded {len(emails)} emails.") @@ -116,4 +118,3 @@ def create_vector_store(path_to_emails, chunk_size, chunk_overlap, db_path, inde # Save vector DB to disk db.save_local(folder_path=db_path, index_name=index_name) print(f"Vector DB index {index_name} saved to {db_path}.") - diff --git a/src/panza3/entities/document.py b/src/panza3/entities/document.py index 7b06dd2..6bce09b 100644 --- a/src/panza3/entities/document.py +++ b/src/panza3/entities/document.py @@ -60,7 +60,9 @@ def deserialize(cls, data: Union[str, Dict]) -> "Email": def process(documents: List["Email"], chunk_size, chunk_overlap) -> List[Document]: # Convert e-mails to langchain documents documents = [ - LangchainDocument(page_content=email.email, metadata={"serialized_document": email.serialize()}) + LangchainDocument( + page_content=email.email, metadata={"serialized_document": email.serialize()} + ) for email in documents ] diff --git a/src/panza3/finetuning/preprocessing.py b/src/panza3/finetuning/preprocessing.py index 874584c..0f17aeb 100644 --- a/src/panza3/finetuning/preprocessing.py +++ b/src/panza3/finetuning/preprocessing.py @@ -9,8 +9,6 @@ PREPROCESSING_CONFIG_FILE = os.environ.get("PANZA_PREPROCESSING_CONFIG") if PREPROCESSING_CONFIG_FILE: - print("Hello from preprocessing.py") - preprocessing_config = OmegaConf.load(PREPROCESSING_CONFIG_FILE) prompt_builder = hydra.utils.instantiate(preprocessing_config.prompting) @@ -27,12 +25,10 @@ def panza_preprocessing_function(inputs: Dict) -> Dict: instruction = EmailInstruction(instruction=prompt_raw, thread=inputs.get("thread", [])) prompt = prompt_builder.build_prompt(instruction) - #print(f"Prompt: {prompt}") - # Generate the full conversation conversation = [ {"role": "user", "content": prompt}, - {"role": "assistant", "content": inputs["email"]} + {"role": "assistant", "content": inputs["email"]}, ] chat_prompt = tokenizer.apply_chat_template(conversation, tokenize=False) diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index ecdd038..2d85aff 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -12,16 +12,21 @@ import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union +import spops import torch from composer import Trainer from composer.core.callback import Callback -from composer.metrics.nlp import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, - InContextLearningLMExpectedCalibrationError, - InContextLearningMCExpectedCalibrationError, - InContextLearningMultipleChoiceAccuracy, - InContextLearningQAAccuracy, LanguageCrossEntropy, - LanguagePerplexity) +from composer.metrics.nlp import ( + InContextLearningCodeEvalAccuracy, + InContextLearningLMAccuracy, + InContextLearningLMExpectedCalibrationError, + InContextLearningMCExpectedCalibrationError, + InContextLearningMultipleChoiceAccuracy, + InContextLearningQAAccuracy, + LanguageCrossEntropy, + LanguagePerplexity, +) from composer.optim import DecoupledAdamW from composer.profiler import JSONTraceHandler, Profiler, TraceHandler, cyclic_schedule from composer.utils import dist, get_device, reproducibility @@ -36,20 +41,28 @@ from peft import get_peft_model from peft.tuners.rosa import RosaConfig, RosaModel, RosaScheduler from rich.traceback import install -from torch.distributed.fsdp import FullStateDictConfig -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from 
torch.distributed.fsdp import StateDictType from transformers import AutoModelForCausalLM, BitsAndBytesConfig, PreTrainedTokenizerBase install() from llmfoundry.callbacks import AsyncEval from llmfoundry.data.dataloader import build_dataloader from llmfoundry.layers_registry import ffns_with_megablocks -from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, build_algorithm, build_callback, - build_composer_model, build_evaluators, build_logger, - build_optimizer, build_scheduler, build_tokenizer) -from llmfoundry.utils.config_utils import (log_config, pop_config, process_init_device, - update_batch_size_info) +from llmfoundry.utils.builders import ( + add_metrics_to_eval_loaders, + build_algorithm, + build_callback, + build_evaluators, + build_logger, + build_optimizer, + build_scheduler, + build_tokenizer, +) +from llmfoundry.utils.config_utils import ( + log_config, + pop_config, + process_init_device, + update_batch_size_info, +) from llmfoundry.utils.registry_utils import import_file import hydra @@ -63,74 +76,72 @@ def validate_config(cfg: DictConfig): """Validates compatible model and dataloader selection.""" loaders = [cfg.train_loader] - if 'eval_loader' in cfg: + if "eval_loader" in cfg: eval_loader = cfg.eval_loader if isinstance(eval_loader, ListConfig): for loader in eval_loader: if loader.label is None: raise ValueError( - 'When specifying multiple evaluation datasets, each one must include the \ - `label` attribute.') + "When specifying multiple evaluation datasets, each one must include the \ + `label` attribute." + ) loaders.append(loader) else: loaders.append(eval_loader) for loader in loaders: - if loader.name == 'text': - if cfg.model.name == 'hf_t5': + if loader.name == "text": + if cfg.model.name == "hf_t5": raise ValueError( - f'Model type "{cfg.model.name}" is not supported when using the "text " ' +\ - f'dataloader. Only finetuning is supported.') + f'Model type "{cfg.model.name}" is not supported when using the "text " ' + + f"dataloader. Only finetuning is supported." + ) - if 'icl_tasks' in cfg: - if cfg.model.name == 'hf_t5': + if "icl_tasks" in cfg: + if cfg.model.name == "hf_t5": raise ValueError( 'ICL evaluation does not currently support Encoder-Decoder models, such as "hf_t5".' ) - if (cfg.model.get('fc_type', 'torch') != 'te' and 'te' not in cfg.model.get( - 'ffn_config', {}).get('ffn_type', 'mptmlp') and - 'fp8' in cfg.precision): + if ( + cfg.model.get("fc_type", "torch") != "te" + and "te" not in cfg.model.get("ffn_config", {}).get("ffn_type", "mptmlp") + and "fp8" in cfg.precision + ): warnings.warn( "fp8 only supported for te.Linear layers. Either set `cfg.model.fc_typ='te'` or " - + - "`cfg.model.ffn_config.ffn_type='te_ln_mlp'` to enable layers using fp8 precision." + + "`cfg.model.ffn_config.ffn_type='te_ln_mlp'` to enable layers using fp8 precision." 
) - if (cfg.model.get('fc_type', 'torch') == 'te' or - 'te' in cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp')): - fsdp_config = cfg.get('fsdp_config', None) - act_ckpt = fsdp_config.get('activation_checkpointing', False) - act_ckpt_reentrant = fsdp_config.get( - 'activation_checkpointing_reentrant', False) + if cfg.model.get("fc_type", "torch") == "te" or "te" in cfg.model.get("ffn_config", {}).get( + "ffn_type", "mptmlp" + ): + fsdp_config = cfg.get("fsdp_config", None) + act_ckpt = fsdp_config.get("activation_checkpointing", False) + act_ckpt_reentrant = fsdp_config.get("activation_checkpointing_reentrant", False) if fsdp_config is not None and act_ckpt == True and act_ckpt_reentrant == True: warnings.warn( - '`te.Linear` layers do not support activation_checkpointing with ' - + '`activation_checkpointing_reentrant = True`. ' + - 'Setting cfg.fsdp_config.activation_checkpointing_reentrant=False.' + "`te.Linear` layers do not support activation_checkpointing with " + + "`activation_checkpointing_reentrant = True`. " + + "Setting cfg.fsdp_config.activation_checkpointing_reentrant=False." ) cfg.fsdp_config.activation_checkpointing_reentrant = False - if cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp') == 'te_ln_mlp': + if cfg.model.get("ffn_config", {}).get("ffn_type", "mptmlp") == "te_ln_mlp": warnings.warn( - '`te.LayerNormMLP` requires has issues with torch._dynamo. ' + - 'Setting `torch._dynamo.config.suppress_errors = True` and falling back to eager.' + "`te.LayerNormMLP` requires has issues with torch._dynamo. " + + "Setting `torch._dynamo.config.suppress_errors = True` and falling back to eager." ) torch._dynamo.config.suppress_errors = True # type: ignore (third-party) - if cfg.model.get('load_in_8bit', False): - raise ValueError( - '`load_in_8bit` is only supported for evaluation rather than training.' - ) + if cfg.model.get("load_in_8bit", False): + raise ValueError("`load_in_8bit` is only supported for evaluation rather than training.") - if cfg.model.get('ffn_config', {}).get('ffn_type', - 'mptmlp') in ffns_with_megablocks: - moe_world_size = cfg.model.get('ffn_config', - {}).get('moe_world_size', 1) - use_orig_params = cfg.get('fsdp_config', - {}).get('use_orig_params', True) + if cfg.model.get("ffn_config", {}).get("ffn_type", "mptmlp") in ffns_with_megablocks: + moe_world_size = cfg.model.get("ffn_config", {}).get("moe_world_size", 1) + use_orig_params = cfg.get("fsdp_config", {}).get("use_orig_params", True) if moe_world_size > 1 and not use_orig_params: raise ValueError( - f'MoEs with expert parallelism (moe_world_size {moe_world_size} > 1) require `use_orig_params=True`.' + f"MoEs with expert parallelism (moe_world_size {moe_world_size} > 1) require `use_orig_params=True`." 
) @@ -169,7 +180,7 @@ def override_rosa_schedule(cfg: DictConfig, mask_generation=False) -> None: rosa_cfg.mask_load_path = None rosa_cfg.mask_save_path = mask_path rosa_cfg.terminate_after_mask_generation = True - rosa_cfg.mask_gen_model_precision = 'amp_bf16' + rosa_cfg.mask_gen_model_precision = "amp_bf16" else: if rosa_cfg.spa_d > 0 and rosa_cfg.lora_r != 0: rosa_cfg.schedule = "default" @@ -187,10 +198,6 @@ def override_rosa_schedule(cfg: DictConfig, mask_generation=False) -> None: OmegaConf.set_struct(rosa_cfg, True) -def create_experiment_yaml() -> str: - pass - - def create_checkpoint_dirs(cfg: DictConfig) -> None: # Create model directory os.makedirs(os.path.join(cfg.checkpoint_dir, "models"), exist_ok=True) @@ -246,69 +253,74 @@ def save_config_to_yaml(cfg: DictConfig) -> str: def build_composer_peft_model( - model_config: str, rosa_config: Dict[str, Any], - tokenizer: PreTrainedTokenizerBase, is_fsdp: bool = False) -> ComposerHFCausalLM: + model_config: str, + rosa_config: Dict[str, Any], + tokenizer: PreTrainedTokenizerBase, + is_fsdp: bool = False, +) -> ComposerHFCausalLM: # 1) loads a hf model, 2) adds peft modules, 3) wraps it in a ComposerHFCausalLM. - print('Building model from HuggingFace checkpoint...') + print("Building model from HuggingFace checkpoint...") - weight_bias_dtype = model_config.get('weight_bias_dtype', None) - if weight_bias_dtype == '4bit': + weight_bias_dtype = model_config.get("weight_bias_dtype", None) + if weight_bias_dtype == "4bit": compute_dtype = torch.bfloat16 quant_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type='nf4', + bnb_4bit_quant_type="nf4", ) - elif weight_bias_dtype == 'bf16': - assert weight_bias_dtype == 'bf16', 'Only bf16 is supported for now' - compute_dtype = torch.bfloat16 - quant_config = None + elif weight_bias_dtype == "bf16": + assert weight_bias_dtype == "bf16", "Only bf16 is supported for now" + compute_dtype = torch.bfloat16 + quant_config = None else: - assert weight_bias_dtype == 'fp32' + assert weight_bias_dtype == "fp32" compute_dtype = torch.float32 quant_config = None with init_empty_weights(include_buffers=False): model = AutoModelForCausalLM.from_pretrained( model_config.pretrained_model_name_or_path, - device_map='cpu' if quant_config is None else 'auto', + device_map="cpu" if quant_config is None else "auto", torch_dtype=compute_dtype, # load_in_4bit=weight_bias_dtype == '4bit', quantization_config=quant_config, trust_remote_code=True, use_auth_token=True, use_cache=False, - attn_implementation='eager' + attn_implementation="eager", ) - print('Model built!') + print("Model built!") if rosa_config is not None: - print('Building RoSA config...') + print("Building RoSA config...") config = RosaConfig( - r=rosa_config['lora_r'], - d=rosa_config['spa_d'], - lora_alpha=rosa_config.get('lora_alpha', 16), - target_modules=rosa_config.get('target_modules', 'all-linear'), - lora_dropout=rosa_config.get('lora_dropout', 0.05), - impl=rosa_config.get('impl', 'auto'), - spa_store_transpose=rosa_config.get('spa_store_transpose', True), - rosa_dtype=rosa_config.get('rosa_dtype', True), - spa_num_grads=rosa_config.get('spa_num_grads', 1), - grad_acc_mode=rosa_config.get('grad_acc_mode', 'mean_squared'), - grad_4bit_accum=rosa_config.get('grad_4bit_accum', False), - mask_load_path=rosa_config.get('mask_load_path', None), - mask_save_path=rosa_config.get('mask_save_path', None), - 
terminate_after_mask_generation=rosa_config.get('terminate_after_mask_generation', False), - schedule=rosa_config.get('schedule', 'df'), + r=rosa_config["lora_r"], + d=rosa_config["spa_d"], + lora_alpha=rosa_config.get("lora_alpha", 16), + target_modules=rosa_config.get("target_modules", "all-linear"), + lora_dropout=rosa_config.get("lora_dropout", 0.05), + impl=rosa_config.get("impl", "auto"), + spa_store_transpose=rosa_config.get("spa_store_transpose", True), + rosa_dtype=rosa_config.get("rosa_dtype", True), + spa_num_grads=rosa_config.get("spa_num_grads", 1), + grad_acc_mode=rosa_config.get("grad_acc_mode", "mean_squared"), + grad_4bit_accum=rosa_config.get("grad_4bit_accum", False), + mask_load_path=rosa_config.get("mask_load_path", None), + mask_save_path=rosa_config.get("mask_save_path", None), + terminate_after_mask_generation=rosa_config.get( + "terminate_after_mask_generation", False + ), + schedule=rosa_config.get("schedule", "df"), bias="none", task_type="CAUSAL_LM", ) - #raise ValueError(config) - print('Adding RoSA modules...') + # raise ValueError(config) + print("Adding RoSA modules...") model = get_peft_model(model, config) - print('RoSA modules added!') + print("RoSA modules added!") train_metrics = [LanguageCrossEntropy(), LanguagePerplexity()] eval_metrics = [ @@ -319,7 +331,7 @@ def build_composer_peft_model( InContextLearningQAAccuracy(), InContextLearningCodeEvalAccuracy(), InContextLearningLMExpectedCalibrationError(), - InContextLearningMCExpectedCalibrationError() + InContextLearningMCExpectedCalibrationError(), ] model = HuggingFaceModelWithFSDP( @@ -328,14 +340,12 @@ def build_composer_peft_model( tokenizer=tokenizer, metrics=train_metrics, eval_metrics=eval_metrics, - init_device='cpu', - peft_config=None + init_device="cpu", + peft_config=None, ) - - # model = ComposerHFCausalLM(model, tokenizer) - # model = ModelComposerHFCausalLM(model, tokenizer) return model + @hydra.main(version_base="1.1", config_path="../../../configs", config_name="panza_finetuning") def main(cfg: DictConfig) -> Trainer: override_config(cfg) @@ -351,10 +361,7 @@ def main(cfg: DictConfig) -> Trainer: cfg.preprocessing.model = cfg.finetuning.model_name_or_path preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) - #create_checkpoint_dirs(cfg) environment = os.environ - # I don't think we need this, since panza is loaded with pip. 
- #environment["PYTHONPATH"] = os.path.join(cfg.panza_workspace, "src") environment["WANDB_PROJECT"] = f"panza-{cfg.user.username}" environment["WANDB_DISABLED"] = str(int(cfg.finetuning.wandb_disabled)) environment["PANZA_PREPROCESSING_CONFIG"] = preprocessing_yaml @@ -365,21 +372,16 @@ def main(cfg: DictConfig) -> Trainer: OmegaConf.set_struct(cfg, False) # Run user provided code if specified - code_paths = pop_config(cfg, - 'code_paths', - must_exist=False, - default_value=[], - convert=True) + code_paths = pop_config(cfg, "code_paths", must_exist=False, default_value=[], convert=True) # Import any user provided code for code_path in code_paths: import_file(code_path) # Filter deprecation warning from torch internal usage warnings.filterwarnings( - action='ignore', + action="ignore", category=UserWarning, - message= - 'torch.distributed.*_base is a private function and will be deprecated.*' + message="torch.distributed.*_base is a private function and will be deprecated.*", ) # Check for incompatibilities between the model and data loaders @@ -393,32 +395,31 @@ def main(cfg: DictConfig) -> Trainer: cuda_alloc_conf = [] # Get max split size mb - max_split_size_mb: Optional[int] = cfg.pop('max_split_size_mb', None) + max_split_size_mb: Optional[int] = cfg.pop("max_split_size_mb", None) if max_split_size_mb is not None: - cuda_alloc_conf.append(f'max_split_size_mb:{max_split_size_mb}') + cuda_alloc_conf.append(f"max_split_size_mb:{max_split_size_mb}") # Expandable segments - if cfg.pop('expandable_segments', False): - cuda_alloc_conf.append('expandable_segments:True') + if cfg.pop("expandable_segments", False): + cuda_alloc_conf.append("expandable_segments:True") if len(cuda_alloc_conf) > 0: - os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(cuda_alloc_conf) + os.environ["PYTORCH_CUDA_ALLOC_CONF"] = ",".join(cuda_alloc_conf) # Set CUDA lazy loading # This can save a bit of memory if not all modules are needed - cuda_load_lazy: bool = cfg.pop('cuda_load_lazy', False) + cuda_load_lazy: bool = cfg.pop("cuda_load_lazy", False) if cuda_load_lazy: - os.environ['CUDA_MODULE_LOADING'] = 'LAZY' + os.environ["CUDA_MODULE_LOADING"] = "LAZY" # Set seed first - seed: int = pop_config(cfg, 'seed', must_exist=True) + seed: int = pop_config(cfg, "seed", must_exist=True) reproducibility.seed_all(seed) # Initialize pytorch distributed training process groups - dist_timeout: Union[int, float] = pop_config(cfg, - 'dist_timeout', - must_exist=False, - default_value=600.0) + dist_timeout: Union[int, float] = pop_config( + cfg, "dist_timeout", must_exist=False, default_value=600.0 + ) dist.initialize_dist(get_device(None), timeout=dist_timeout) # Get global and device batch size information from distributed/single node setting @@ -426,244 +427,175 @@ def main(cfg: DictConfig) -> Trainer: logged_cfg.update(cfg, merge=True) # Mandatory model training configs - model_config: DictConfig = pop_config(cfg, 'model', must_exist=True) - tokenizer_config: Dict[str, Any] = pop_config(cfg, - 'tokenizer', - must_exist=True, - convert=True) - optimizer_config: Dict[str, Any] = pop_config(cfg, - 'optimizer', - must_exist=True, - convert=True) - scheduler_config: Dict[str, Any] = pop_config(cfg, - 'scheduler', - must_exist=True, - convert=True) - train_loader_config: DictConfig = pop_config(cfg, - 'train_loader', - must_exist=True) + model_config: DictConfig = pop_config(cfg, "model", must_exist=True) + tokenizer_config: Dict[str, Any] = pop_config(cfg, "tokenizer", must_exist=True, convert=True) + optimizer_config: Dict[str, 
Any] = pop_config(cfg, "optimizer", must_exist=True, convert=True) + scheduler_config: Dict[str, Any] = pop_config(cfg, "scheduler", must_exist=True, convert=True) + train_loader_config: DictConfig = pop_config(cfg, "train_loader", must_exist=True) # Optional fsdp data, fine-tuning, and eval configs - fsdp_config: Optional[Dict[str, Any]] = pop_config(cfg, - 'fsdp_config', - must_exist=False, - default_value=None, - convert=True) - - ds_config: Optional[Dict[str, Any]] = pop_config(cfg, - 'ds_config', - must_exist=False, - default_value=None, - convert=True) - - rosa_config: Optional[Dict[str, Any]] = pop_config(cfg, - 'rosa', - must_exist=False, - default_value=None, - convert=True) - - hf_save_path: Union[int, str] = pop_config(cfg, - 'hf_save_path', - must_exist=True) + fsdp_config: Optional[Dict[str, Any]] = pop_config( + cfg, "fsdp_config", must_exist=False, default_value=None, convert=True + ) + + ds_config: Optional[Dict[str, Any]] = pop_config( + cfg, "ds_config", must_exist=False, default_value=None, convert=True + ) + + rosa_config: Optional[Dict[str, Any]] = pop_config( + cfg, "rosa", must_exist=False, default_value=None, convert=True + ) + + hf_save_path: Union[int, str] = pop_config(cfg, "hf_save_path", must_exist=True) eval_loader_config: Optional[Union[DictConfig, ListConfig]] = pop_config( - cfg, 'eval_loader', must_exist=False, default_value=None) - icl_tasks_config: Optional[Union[ListConfig, - str]] = pop_config(cfg, - 'icl_tasks', - must_exist=False, - default_value=None) - eval_gauntlet_config: Optional[Union[DictConfig, - str]] = pop_config(cfg, - 'eval_gauntlet', - must_exist=False, - default_value=None) - icl_subset_num_batches: Optional[int] = pop_config(cfg, - 'icl_subset_num_batches', - must_exist=False, - default_value=None) - icl_seq_len: Optional[int] = pop_config(cfg, - 'icl_seq_len', - must_exist=False, - default_value=None) + cfg, "eval_loader", must_exist=False, default_value=None + ) + icl_tasks_config: Optional[Union[ListConfig, str]] = pop_config( + cfg, "icl_tasks", must_exist=False, default_value=None + ) + eval_gauntlet_config: Optional[Union[DictConfig, str]] = pop_config( + cfg, "eval_gauntlet", must_exist=False, default_value=None + ) + icl_subset_num_batches: Optional[int] = pop_config( + cfg, "icl_subset_num_batches", must_exist=False, default_value=None + ) + icl_seq_len: Optional[int] = pop_config( + cfg, "icl_seq_len", must_exist=False, default_value=None + ) # Optional logging, evaluation and callback configs - logger_configs: Optional[DictConfig] = pop_config(cfg, - 'loggers', - must_exist=False, - default_value=None, - convert=True) - callback_configs: Optional[DictConfig] = pop_config(cfg, - 'callbacks', - must_exist=False, - default_value=None, - convert=True) - algorithm_configs: Optional[DictConfig] = pop_config(cfg, - 'algorithms', - must_exist=False, - default_value=None) + logger_configs: Optional[DictConfig] = pop_config( + cfg, "loggers", must_exist=False, default_value=None, convert=True + ) + callback_configs: Optional[DictConfig] = pop_config( + cfg, "callbacks", must_exist=False, default_value=None, convert=True + ) + algorithm_configs: Optional[DictConfig] = pop_config( + cfg, "algorithms", must_exist=False, default_value=None + ) # Mandatory hyperparameters for training - device_train_batch_size: int = pop_config(cfg, - 'device_train_batch_size', - must_exist=True) - device_eval_batch_size: int = pop_config(cfg, - 'device_eval_batch_size', - must_exist=True) - max_duration: Union[int, str] = pop_config(cfg, - 'max_duration', 
- must_exist=True) - eval_interval: Union[int, str] = pop_config(cfg, - 'eval_interval', - default_value=1, - must_exist=False) - precision: str = pop_config(cfg, 'precision', must_exist=True) - max_seq_len: int = pop_config(cfg, 'max_seq_len', must_exist=True) + device_train_batch_size: int = pop_config(cfg, "device_train_batch_size", must_exist=True) + device_eval_batch_size: int = pop_config(cfg, "device_eval_batch_size", must_exist=True) + max_duration: Union[int, str] = pop_config(cfg, "max_duration", must_exist=True) + eval_interval: Union[int, str] = pop_config( + cfg, "eval_interval", default_value=1, must_exist=False + ) + precision: str = pop_config(cfg, "precision", must_exist=True) + max_seq_len: int = pop_config(cfg, "max_seq_len", must_exist=True) # Optional parameters will be set to default values if not specified. - default_run_name: str = os.environ.get('RUN_NAME', 'llm') - run_name: str = pop_config(cfg, - 'run_name', - must_exist=False, - default_value=default_run_name) - save_folder: Optional[str] = pop_config(cfg, - 'save_folder', - must_exist=False, - default_value=None) - is_state_dict_sharded: bool = (fsdp_config.get('state_dict_type', 'full') - == 'sharded') if fsdp_config else False + default_run_name: str = os.environ.get("RUN_NAME", "llm") + run_name: str = pop_config(cfg, "run_name", must_exist=False, default_value=default_run_name) + save_folder: Optional[str] = pop_config( + cfg, "save_folder", must_exist=False, default_value=None + ) + is_state_dict_sharded: bool = ( + (fsdp_config.get("state_dict_type", "full") == "sharded") if fsdp_config else False + ) save_latest_filename: str = pop_config( cfg, - 'save_latest_filename', + "save_latest_filename", must_exist=False, - default_value='latest-sharded-rank{rank}' - if is_state_dict_sharded else 'latest-rank{rank}.pt') - save_overwrite: bool = pop_config(cfg, - 'save_overwrite', - must_exist=False, - default_value=False) - save_weights_only: bool = pop_config(cfg, - 'save_weights_only', - must_exist=False, - default_value=False) + default_value=( + "latest-sharded-rank{rank}" if is_state_dict_sharded else "latest-rank{rank}.pt" + ), + ) + save_overwrite: bool = pop_config(cfg, "save_overwrite", must_exist=False, default_value=False) + save_weights_only: bool = pop_config( + cfg, "save_weights_only", must_exist=False, default_value=False + ) save_filename: str = pop_config( - cfg, - 'save_filename', - must_exist=False, - default_value='ep{epoch}-ba{batch}-rank{rank}.pt') - save_interval: Union[str, int] = pop_config(cfg, - 'save_interval', - must_exist=False, - default_value='1000ba') + cfg, "save_filename", must_exist=False, default_value="ep{epoch}-ba{batch}-rank{rank}.pt" + ) + save_interval: Union[str, int] = pop_config( + cfg, "save_interval", must_exist=False, default_value="1000ba" + ) save_num_checkpoints_to_keep: int = pop_config( - cfg, 'save_num_checkpoints_to_keep', must_exist=False, default_value=-1) - progress_bar = pop_config(cfg, - 'progress_bar', - must_exist=False, - default_value=False) - log_to_console: bool = pop_config(cfg, - 'log_to_console', - must_exist=False, - default_value=True) - python_log_level: Optional[str] = pop_config(cfg, - 'python_log_level', - must_exist=False, - default_value='debug') - console_log_interval: Union[int, str] = pop_config(cfg, - 'console_log_interval', - must_exist=False, - default_value='1ba') + cfg, "save_num_checkpoints_to_keep", must_exist=False, default_value=-1 + ) + progress_bar = pop_config(cfg, "progress_bar", must_exist=False, default_value=False) + 
log_to_console: bool = pop_config(cfg, "log_to_console", must_exist=False, default_value=True) + python_log_level: Optional[str] = pop_config( + cfg, "python_log_level", must_exist=False, default_value="debug" + ) + console_log_interval: Union[int, str] = pop_config( + cfg, "console_log_interval", must_exist=False, default_value="1ba" + ) device_train_microbatch_size: Union[str, int] = pop_config( - cfg, - 'device_train_microbatch_size', - must_exist=False, - default_value='auto') - eval_subset_num_batches: int = pop_config(cfg, - 'eval_subset_num_batches', - must_exist=False, - default_value=-1) - eval_first: bool = pop_config(cfg, - 'eval_first', - must_exist=False, - default_value=False) - load_path: str = pop_config(cfg, - 'load_path', - must_exist=False, - default_value=None) - load_weights_only: bool = pop_config(cfg, - 'load_weights_only', - must_exist=False, - default_value=False) - load_strict_model_weights: bool = pop_config(cfg, - 'load_strict_model_weights', - must_exist=False, - default_value=True) - load_ignore_keys: Optional[List[str]] = pop_config(cfg, - 'load_ignore_keys', - must_exist=False, - default_value=None) - save_ignore_keys: Optional[List[str]] = pop_config(cfg, - 'save_ignore_keys', - must_exist=False, - default_value=None) - compile_config: Optional[Dict[str, Any]] = pop_config(cfg, - 'compile_config', - must_exist=False, - default_value=None) - metadata: Optional[Dict[str, str]] = pop_config(cfg, - 'metadata', - must_exist=False, - default_value=None, - convert=True) - should_log_config: bool = pop_config(cfg, - 'log_config', - must_exist=False, - default_value=True) - - num_cpu_threads: Optional[int] = cfg.pop('num_cpu_threads', 0) + cfg, "device_train_microbatch_size", must_exist=False, default_value="auto" + ) + eval_subset_num_batches: int = pop_config( + cfg, "eval_subset_num_batches", must_exist=False, default_value=-1 + ) + eval_first: bool = pop_config(cfg, "eval_first", must_exist=False, default_value=False) + load_path: str = pop_config(cfg, "load_path", must_exist=False, default_value=None) + load_weights_only: bool = pop_config( + cfg, "load_weights_only", must_exist=False, default_value=False + ) + load_strict_model_weights: bool = pop_config( + cfg, "load_strict_model_weights", must_exist=False, default_value=True + ) + load_ignore_keys: Optional[List[str]] = pop_config( + cfg, "load_ignore_keys", must_exist=False, default_value=None + ) + save_ignore_keys: Optional[List[str]] = pop_config( + cfg, "save_ignore_keys", must_exist=False, default_value=None + ) + compile_config: Optional[Dict[str, Any]] = pop_config( + cfg, "compile_config", must_exist=False, default_value=None + ) + metadata: Optional[Dict[str, str]] = pop_config( + cfg, "metadata", must_exist=False, default_value=None, convert=True + ) + should_log_config: bool = pop_config(cfg, "log_config", must_exist=False, default_value=True) + + num_cpu_threads: Optional[int] = cfg.pop("num_cpu_threads", 0) if num_cpu_threads > 0: - print(f'Setting number of CPU threads to {num_cpu_threads}') - import spops + print(f"Setting number of CPU threads to {num_cpu_threads}") torch.set_num_threads(num_cpu_threads) spops.set_num_threads(num_cpu_threads) # Enable autoresume from model checkpoints if possible autoresume_default: bool = False - if logged_cfg.get('run_name', None) is not None \ - and save_folder is not None \ - and not save_overwrite \ - and not save_weights_only: + if ( + logged_cfg.get("run_name", None) is not None + and save_folder is not None + and not save_overwrite + and not 
save_weights_only + ): autoresume_default = True - if cfg.get('autoresume') is None and autoresume_default: - log.info('As run_name, save_folder, and save_latest_filename are set, \ - changing autoresume default to True...') + if cfg.get("autoresume") is None and autoresume_default: + log.info( + "As run_name, save_folder, and save_latest_filename are set, \ + changing autoresume default to True..." + ) - autoresume: bool = pop_config(cfg, - 'autoresume', - must_exist=False, - default_value=autoresume_default) + autoresume: bool = pop_config( + cfg, "autoresume", must_exist=False, default_value=autoresume_default + ) # Pop known unused parameters that are used as interpolation variables or # created by update_batch_size_info. - pop_config(cfg, 'data_local', must_exist=False) - pop_config(cfg, 'data_remote', must_exist=False) - pop_config(cfg, 'global_seed', must_exist=False) - pop_config(cfg, 'global_train_batch_size', must_exist=False) - pop_config(cfg, 'n_gpus', must_exist=False) - pop_config(cfg, 'device_train_grad_accum', must_exist=False) + pop_config(cfg, "data_local", must_exist=False) + pop_config(cfg, "data_remote", must_exist=False) + pop_config(cfg, "global_seed", must_exist=False) + pop_config(cfg, "global_train_batch_size", must_exist=False) + pop_config(cfg, "n_gpus", must_exist=False) + pop_config(cfg, "device_train_grad_accum", must_exist=False) - assert fsdp_config is None or ds_config is None, 'fsdp and deepspeed are not supported together' + assert fsdp_config is None or ds_config is None, "fsdp and deepspeed are not supported together" # Warn users for unused parameters for key in cfg: warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary.' + f"Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary." ) # Warn if fsdp is enabled but user only has 1 GPU if dist.get_world_size() == 1 and fsdp_config is not None: - warnings.warn( - 'FSDP is not applicable for single-GPU training. Reverting to DDP.') + warnings.warn("FSDP is not applicable for single-GPU training. 
Reverting to DDP.") fsdp_config = None # set logging level @@ -671,34 +603,32 @@ def main(cfg: DictConfig) -> Trainer: logging.basicConfig( # Example of format string # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here - format= - f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' + format=f"%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s" ) - logging.getLogger('llmfoundry').setLevel( - python_log_level.upper()) # Foundry module - logging.getLogger(__name__).setLevel( - python_log_level.upper()) # Train script + logging.getLogger("llmfoundry").setLevel(python_log_level.upper()) # Foundry module + logging.getLogger(__name__).setLevel(python_log_level.upper()) # Train script # Initialize context init_context = process_init_device(model_config, fsdp_config) - logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) + logged_cfg.update({"fsdp_config": fsdp_config}, merge=True) # Build tokenizer - log.info('Building tokenizer...') - tokenizer_name = tokenizer_config['name'] - tokenizer_kwargs = tokenizer_config.get('kwargs', {}) + log.info("Building tokenizer...") + tokenizer_name = tokenizer_config["name"] + tokenizer_kwargs = tokenizer_config.get("kwargs", {}) tokenizer_kwargs["num_proc"] = 1 tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) # Scheduler - scheduler_name: str = scheduler_config.pop('name') + scheduler_name: str = scheduler_config.pop("name") scheduler = build_scheduler(scheduler_name, scheduler_config) # Loggers - loggers = [ - build_logger(str(name), logger_cfg) - for name, logger_cfg in logger_configs.items() - ] if logger_configs else [] + loggers = ( + [build_logger(str(name), logger_cfg) for name, logger_cfg in logger_configs.items()] + if logger_configs + else [] + ) mosaicml_logger = find_mosaicml_logger(loggers) if mosaicml_logger is None: @@ -709,7 +639,7 @@ def main(cfg: DictConfig) -> Trainer: if metadata is not None: # Flatten the metadata for logging - logged_cfg.pop('metadata', None) + logged_cfg.pop("metadata", None) logged_cfg.update(metadata, merge=True) if mosaicml_logger is not None: mosaicml_logger.log_metrics(metadata) @@ -717,59 +647,65 @@ def main(cfg: DictConfig) -> Trainer: # Profiling profiler: Optional[Profiler] = None - profiler_cfg: Optional[DictConfig] = pop_config(cfg, - 'profiler', - must_exist=False, - convert=False, - default_value=None) + profiler_cfg: Optional[DictConfig] = pop_config( + cfg, "profiler", must_exist=False, convert=False, default_value=None + ) if profiler_cfg: - profiler_schedule_cfg: Dict = pop_config(profiler_cfg, - 'schedule', - must_exist=True, - convert=True) + profiler_schedule_cfg: Dict = pop_config( + profiler_cfg, "schedule", must_exist=True, convert=True + ) profiler_schedule = cyclic_schedule(**profiler_schedule_cfg) # Only support json trace handler profiler_trace_handlers: List[TraceHandler] = [] - profiler_trace_cfg: Optional[Dict] = pop_config(profiler_cfg, - 'json_trace_handler', - must_exist=False, - default_value=None, - convert=True) + profiler_trace_cfg: Optional[Dict] = pop_config( + profiler_cfg, "json_trace_handler", must_exist=False, default_value=None, convert=True + ) if profiler_trace_cfg: - profiler_trace_handlers.append( - JSONTraceHandler(**profiler_trace_cfg)) - profiler = Profiler(**profiler_cfg, - trace_handlers=profiler_trace_handlers, - schedule=profiler_schedule) + profiler_trace_handlers.append(JSONTraceHandler(**profiler_trace_cfg)) 
+ profiler = Profiler( + **profiler_cfg, trace_handlers=profiler_trace_handlers, schedule=profiler_schedule + ) # Callbacks - callbacks: List[Callback] = [ - build_callback(str(name), callback_cfg, om.to_container(logged_cfg)) - for name, callback_cfg in callback_configs.items() - ] if callback_configs else [] + callbacks: List[Callback] = ( + [ + build_callback(str(name), callback_cfg, om.to_container(logged_cfg)) + for name, callback_cfg in callback_configs.items() + ] + if callback_configs + else [] + ) use_async_eval = any(isinstance(c, AsyncEval) for c in callbacks) - print('ROSA CONFIG', rosa_config) + print("ROSA CONFIG", rosa_config) # Build Model - print('Initializing model...') + print("Initializing model...") with init_context: - assert fsdp_config is None or rosa_config is None, 'fsdp is cuurently not supported with RoSA' - model = build_composer_peft_model(model_config, rosa_config, tokenizer, is_fsdp=fsdp_config is not None) + assert ( + fsdp_config is None or rosa_config is None + ), "fsdp is cuurently not supported with RoSA" + model = build_composer_peft_model( + model_config, rosa_config, tokenizer, is_fsdp=fsdp_config is not None + ) if rosa_config is not None: assert isinstance(model.model.base_model, RosaModel) # Algorithms - algorithms = [ - build_algorithm(str(name), algorithm_cfg) - for name, algorithm_cfg in algorithm_configs.items() - ] if algorithm_configs else [] + algorithms = ( + [ + build_algorithm(str(name), algorithm_cfg) + for name, algorithm_cfg in algorithm_configs.items() + ] + if algorithm_configs + else [] + ) if rosa_config is not None: algorithms.append(RosaScheduler(model.model.base_model)) # Dataloaders - log.info('Building train loader...') + log.info("Building train loader...") try: disable_caching() train_loader = build_dataloader( @@ -778,24 +714,23 @@ def main(cfg: DictConfig) -> Trainer: device_train_batch_size, ) except Exception as e: + print("I am I here") if mosaicml_logger is not None: mosaicml_logger.log_exception(e) raise e if mosaicml_logger is not None: - mosaicml_logger.log_metrics({'data_validated': time.time()}) + mosaicml_logger.log_metrics({"data_validated": time.time()}) ## Evaluation if use_async_eval: evaluators = [] if eval_first: - warnings.warn( - 'AsyncEval callback does not support eval_first=True. Ignoring.' - ) + warnings.warn("AsyncEval callback does not support eval_first=True. 
Ignoring.") eval_first = False else: - log.info('Building eval loader...') + log.info("Building eval loader...") eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len evaluators, _, eval_gauntlet_callback = build_evaluators( eval_loader_config, @@ -810,84 +745,76 @@ def main(cfg: DictConfig) -> Trainer: callbacks.append(eval_gauntlet_callback) if mosaicml_logger is not None: - log_train_analytics(mosaicml_logger, model_config, train_loader_config, - eval_loader_config, callback_configs, - tokenizer_name, load_path, icl_tasks_config, - eval_gauntlet_config) - # # Build Model - # log.info('Initializing model...') - # model = build_composer_model( - # name=model_config.name, - # cfg=model_config, - # tokenizer=tokenizer, - # init_context=init_context, - # master_weights_dtype=model_config.get('master_weights_dtype', None), - # ) - + log_train_analytics( + mosaicml_logger, + model_config, + train_loader_config, + eval_loader_config, + callback_configs, + tokenizer_name, + load_path, + icl_tasks_config, + eval_gauntlet_config, + ) # Log number of parameters - if hasattr(model, 'n_total_params'): + if hasattr(model, "n_total_params"): n_params = model.n_total_params n_trainable_params = n_params # TODO: we currently assume all parameters are trainable. else: n_params = sum(p.numel() for p in model.parameters()) - n_trainable_params = sum( - p.numel() for p in model.parameters() if p.requires_grad) - if hasattr(model, 'n_active_params'): + n_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + if hasattr(model, "n_active_params"): n_active_params = model.n_active_params else: n_active_params = n_params - logged_cfg.update({ - 'n_params': n_params, - 'n_active_params': n_active_params, - 'n_trainable_params': n_trainable_params, - }) + logged_cfg.update( + { + "n_params": n_params, + "n_active_params": n_active_params, + "n_trainable_params": n_trainable_params, + } + ) # Optimizer - optimizer_name: str = optimizer_config.pop('name') - if rosa_config is None or 'lora_lr' not in rosa_config: + optimizer_name: str = optimizer_config.pop("name") + if rosa_config is None or "lora_lr" not in rosa_config: optimizer = build_optimizer(model, optimizer_name, optimizer_config) else: print(f'Using a different learning rate for lora params {rosa_config["lora_lr"]}') - assert optimizer_name == 'decoupled_adamw' + assert optimizer_name == "decoupled_adamw" lora_params = [] other_params = [] for name, param in model.named_parameters(): - if any([k in name for k in ['rosa_A', 'rosa_B', 'rosa_embedding_A', 'rosa_embedding_B']]): + if any( + [k in name for k in ["rosa_A", "rosa_B", "rosa_embedding_A", "rosa_embedding_B"]] + ): lora_params.append(param) else: other_params.append(param) - print(f'Found {len(lora_params)} lora params and {len(other_params)} other params') - params = [ - {'params': other_params}, - {'params': lora_params, 'lr': rosa_config['lora_lr']} - ] + print(f"Found {len(lora_params)} lora params and {len(other_params)} other params") + params = [{"params": other_params}, {"params": lora_params, "lr": rosa_config["lora_lr"]}] optimizer = DecoupledAdamW(params, **optimizer_config) - - # Now add the eval metrics try: if eval_loader_config is not None and not use_async_eval: eval_metrics = model.get_metrics(is_train=False) non_icl_metrics = [ - metric_name for metric_name, metric in eval_metrics.items() + metric_name + for metric_name, metric in eval_metrics.items() if not isinstance(metric, InContextLearningMetric) ] - evaluators = 
add_metrics_to_eval_loaders(evaluators, - non_icl_metrics) + evaluators = add_metrics_to_eval_loaders(evaluators, non_icl_metrics) except Exception as e: if mosaicml_logger is not None: mosaicml_logger.log_exception(e) raise e # Build the Trainer - log.info('Building trainer...') + log.info("Building trainer...") dtypes = {x.dtype for x in model.parameters()} print(dtypes) - #raise ValueError(dtypes) - #raise ValueError(model.dtype) - #raise ValueError([save_folder, save_overwrite, save_filename, save_latest_filename, save_interval, save_overwrite]) trainer = Trainer( run_name=run_name, seed=seed, @@ -929,7 +856,7 @@ def main(cfg: DictConfig) -> Trainer: ) if should_log_config: - log.info('Logging config') + log.info("Logging config") log_config(logged_cfg) torch.cuda.empty_cache() gc.collect() @@ -938,7 +865,7 @@ def main(cfg: DictConfig) -> Trainer: if eval_first and trainer.state.timestamp.batch.value == 0: trainer.eval() - log.info('Starting training...') + log.info("Starting training...") trainer.fit() # Hacky solution for moving the model checkpoint from the @@ -955,36 +882,18 @@ def main(cfg: DictConfig) -> Trainer: # if rosa is enabled, save the model manually, since # llm-foundry's checkpointing doesn't work properly with RoSA if rosa_config is not None: - assert fsdp_config is None, 'fsdp is cuurently not supported with RoSA' + assert fsdp_config is None, "fsdp is cuurently not supported with RoSA" path_to_save = os.path.join(hf_save_path, run_name) - print(f'saving the model to {path_to_save}') + print(f"saving the model to {path_to_save}") if torch.distributed.get_rank() == 0: - model.model.save_pretrained(path_to_save, is_main_process=True, state_dict=model.model.state_dict()) + model.model.save_pretrained( + path_to_save, is_main_process=True, state_dict=model.model.state_dict() + ) tokenizer.save_pretrained(path_to_save) - # print('Saving directly into HF-friendly format') - - # path_to_save = os.path.join(hf_save_path, run_name) - # print('saving the model.') - # if fsdp_config is None: - # model.model.save_pretrained(path_to_save, is_main_process=torch.distributed.get_rank() == 0, state_dict=model.model.state_dict()) - # else: - # with FSDP.summon_full_params(model.model, writeback=False, rank0_only=True, offload_to_cpu=True): - # model_to_save = model.model - # model_to_save.save_pretrained(path_to_save, state_dict=model_to_save.state_dict()) - - # if torch.distributed.get_rank() == 0: - # tokenizer.save_pretrained(path_to_save) - - # # NOTE: for some reason the saving code above would create empty pytorch_model.bin file, so we delete it manually - # # TODO: figure out why this happens - # if torch.distributed.get_rank() == 0 and os.path.exists(os.path.join(path_to_save, "pytorch_model.bin")): - # tmp = torch.load(os.path.join(path_to_save, "pytorch_model.bin")) - # if not tmp: # empty dict, remove it - # os.remove(os.path.join(path_to_save, "pytorch_model.bin")) - - log.info('Done.') + log.info("Done.") return trainer -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/src/panza3/interface/json.py b/src/panza3/interface/json.py index 2a64e77..85efb73 100644 --- a/src/panza3/interface/json.py +++ b/src/panza3/interface/json.py @@ -1,4 +1,4 @@ -from panza3.entities.instruction import EmailInstruction, Instruction +from panza3.entities.instruction import EmailInstruction from panza3.writer import PanzaWriter import json @@ -7,60 +7,72 @@ import re from tqdm import tqdm - from evaluate import load from torchmetrics.text.bleu import BLEUScore from 
torchmetrics.text.rouge import ROUGEScore import string + punc_table = str.maketrans({key: None for key in string.punctuation}) rouge = ROUGEScore() bleu1 = BLEUScore(n_gram=1) bleu2 = BLEUScore(n_gram=2) bleu3 = BLEUScore(n_gram=3) bleu4 = BLEUScore(n_gram=4) -mauve = load('mauve') -from tqdm import tqdm +mauve = load("mauve") + def compute_rouge_scores(predictions, goldens): - goldens= [" ".join(x.translate(punc_table).lower().split()) for x in goldens] - candidates = [" ".join(prediction.translate(punc_table).lower().split()) for prediction in predictions] - scores = [{k: v.item() for k, v in rouge(candidate, goldens).items()} for candidate in candidates] + goldens = [" ".join(x.translate(punc_table).lower().split()) for x in goldens] + candidates = [ + " ".join(prediction.translate(punc_table).lower().split()) for prediction in predictions + ] + scores = [ + {k: v.item() for k, v in rouge(candidate, goldens).items()} for candidate in candidates + ] return scores + def compute_bleu_scores(predictions, goldens): - goldens= [" ".join(x.translate(punc_table).lower().split()) for x in goldens] - candidates = [" ".join(prediction.translate(punc_table).lower().split()) for prediction in predictions] - bleu_scores = [np.mean([bleu([candidate], [goldens]) for bleu in [bleu1, bleu2, bleu3, bleu4]]) for candidate in candidates] + goldens = [" ".join(x.translate(punc_table).lower().split()) for x in goldens] + candidates = [ + " ".join(prediction.translate(punc_table).lower().split()) for prediction in predictions + ] + bleu_scores = [ + np.mean([bleu([candidate], [goldens]) for bleu in [bleu1, bleu2, bleu3, bleu4]]) + for candidate in candidates + ] return [s.item() for s in bleu_scores] + def compute_mauve_score(predictions, goldens): - predictions = [prediction for nested_prediction in predictions for prediction in nested_prediction] + predictions = [ + prediction for nested_prediction in predictions for prediction in nested_prediction + ] goldens = [golden for nested_golden in goldens for golden in nested_golden] mauve_score = mauve.compute(predictions=predictions, references=goldens) return mauve_score class PanzaJSON: - def compose_output_folder(self, json_path, checkpoint, panza_workspace, username): if os.path.isdir(checkpoint): # Presumably this is a Panza-trained model; go ahead # and put the json output into the same folder. output_dir = checkpoint - else: + else: # Assume that this is a huggingface model identified by its hf handle. # We don't want to populate the cached model folder, so instead # we create a folder in the Panza workspace to put the output. - output_dir = os.path.join(panza_workspace, "checkpoints", "models", checkpoint, username) + output_dir = os.path.join( + panza_workspace, "checkpoints", "models", checkpoint, username + ) os.makedirs(output_dir, exist_ok=True) filename_no_ext = os.path.splitext(os.path.basename(json_path))[0] return os.path.join(output_dir, f"{filename_no_ext}_outputs.json") + def assemble_responses(self, prompts_json, batch_size, use_thread, responses_per_prompt): - def assemble_responses(self, prompts_json, batch_size, use_thread, - responses_per_prompt): - - with open (prompts_json, "r") as f: + with open(prompts_json, "r") as f: golden_lines = [json.loads(l) for l in f.readlines()] # Group json lines together by prompt to avoid weirdness in @@ -72,12 +84,12 @@ def assemble_responses(self, prompts_json, batch_size, use_thread, for entry in golden_lines: # 'summary' is the name of the 'prompt' field, i.e., the one to group on. 
if entry["summary"] in grouped_golden: - if 'email' in entry: + if "email" in entry: has_goldens = True grouped_golden[entry["summary"]]["goldens"].append(entry["email"]) else: grouped_golden[entry["summary"]] = {} - if 'email' in entry: + if "email" in entry: has_goldens = True grouped_golden[entry["summary"]]["goldens"] = [(entry["email"])] grouped_golden[entry["summary"]]["thread"] = entry["thread"] @@ -86,30 +98,38 @@ def assemble_responses(self, prompts_json, batch_size, use_thread, all_responses = [] for i in tqdm(range(0, len(grouped_golden), batch_size)): - batch = grouped_golden[i:i + batch_size] + batch = grouped_golden[i : i + batch_size] prompts = [item[0] for item in batch] if use_thread: threads = [item[1]["thread"] for item in batch] golden_responses = [item[1]["goldens"] for item in batch] - responses = [{"prompt": p, - "full_prompt": None, - "thread": None if not use_thread else threads[i], - "golden_responses": golden_responses[i], - "panza_responses": []} for i, p in enumerate(prompts)] + responses = [ + { + "prompt": p, + "full_prompt": None, + "thread": None if not use_thread else threads[i], + "golden_responses": golden_responses[i], + "panza_responses": [], + } + for i, p in enumerate(prompts) + ] for _ in range(responses_per_prompt): if use_thread: instructions = list(zip(prompts, threads)) else: - instructions = list(zip(prompts, [None]*len(prompts))) + instructions = list(zip(prompts, [None] * len(prompts))) - outputs, full_prompts = self.writer.run_batch([EmailInstruction(user_input) for user_input in instructions], return_prompt=True) + outputs, full_prompts = self.writer.run_batch( + [EmailInstruction(user_input) for user_input in instructions], + return_prompt=True, + ) # Remove some boilerplate added by instruction-tuned models w/out finetuning. 
outputs = [o.replace("Here is the email:\n", "") for o in outputs] - outputs = [re.sub(r'SUBJECT:.*\n', "", o) for o in outputs] - outputs = [re.sub(r'Subject:.*\n', "", o) for o in outputs] - outputs = [re.sub(r'E-MAIL CONTENT:.*\n', "", o) for o in outputs] + outputs = [re.sub(r"SUBJECT:.*\n", "", o) for o in outputs] + outputs = [re.sub(r"Subject:.*\n", "", o) for o in outputs] + outputs = [re.sub(r"E-MAIL CONTENT:.*\n", "", o) for o in outputs] for i, r in enumerate(responses): r["full_prompt"] = full_prompts[i] @@ -121,46 +141,52 @@ def do_compute_metrics(self, all_responses): for response in all_responses: response["scores"] = {} response["scores"]["BLEU"] = compute_bleu_scores( - response["panza_responses"], - response["golden_responses"]) + response["panza_responses"], response["golden_responses"] + ) response["scores"]["ROUGE"] = compute_rouge_scores( - response["panza_responses"], - response["golden_responses"]) + response["panza_responses"], response["golden_responses"] + ) rouge_categories = all_responses[0]["scores"]["ROUGE"][0].keys() aggregate_metrics = { "BLEU": np.mean([s for r in all_responses for s in r["scores"]["BLEU"]]), - "ROUGE": {cat: - np.mean([ - s[cat] for r in all_responses for s in r["scores"]["ROUGE"]]) - for cat in rouge_categories}, - "MAUVE": compute_mauve_score([r["panza_responses"] for r in all_responses], - [r["golden_responses"] for r in all_responses]).mauve + "ROUGE": { + cat: np.mean([s[cat] for r in all_responses for s in r["scores"]["ROUGE"]]) + for cat in rouge_categories + }, + "MAUVE": compute_mauve_score( + [r["panza_responses"] for r in all_responses], + [r["golden_responses"] for r in all_responses], + ).mauve, } print("########## Aggregated quality metrics ##########\n") print(json.dumps(aggregate_metrics, indent=2)) return {"responses": all_responses, "aggregate_metrics": aggregate_metrics} - - def __init__(self, writer: PanzaWriter, - checkpoint: str, - panza_workspace: str, - input_file: str, - batch_size: int, - use_thread: bool, - responses_per_prompt: int, - compute_metrics: bool, - username: str): + def __init__( + self, + writer: PanzaWriter, + checkpoint: str, + panza_workspace: str, + input_file: str, + batch_size: int, + use_thread: bool, + responses_per_prompt: int, + compute_metrics: bool, + username: str, + ): self.writer = writer - responses, has_goldens = self.assemble_responses(input_file, batch_size, - use_thread, responses_per_prompt) + responses, has_goldens = self.assemble_responses( + input_file, batch_size, use_thread, responses_per_prompt + ) if compute_metrics: if has_goldens: responses = self.do_compute_metrics(responses) else: - print("Warning: metrics requested but no golden labels given!", - "\nDumping responses without computing metrics.") + print( + "Warning: metrics requested but no golden labels given!", + "\nDumping responses without computing metrics.", + ) - output_path = self.compose_output_folder( - input_file, checkpoint, panza_workspace, username) - with open(output_path, 'w') as f: + output_path = self.compose_output_folder(input_file, checkpoint, panza_workspace, username) + with open(output_path, "w") as f: json.dump(responses, f, indent=4, sort_keys=True) From c13aa07cb3961788ab786815b37e30a74945d6d3 Mon Sep 17 00:00:00 2001 From: Andrej Jovanovic Date: Fri, 8 Nov 2024 17:59:06 +0100 Subject: [PATCH 098/112] Fix function address --- README_panza3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_panza3.md b/README_panza3.md index 38a9dcd..c2c1d09 100644 --- 
a/README_panza3.md +++ b/README_panza3.md @@ -185,7 +185,7 @@ CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-
If you wish to add CUDA_VISIBLE_DEVICES to specify a specific GPU, please add this in the shell script directly by export CUDA_VISIBLE_DEVICES=x where x is the ID of the GPU you wish to use.

- A known issue is that when you fine-tune your model with RAG, there can be a case when the tokenization of the dataset seemingly hangs. This is due to a known bug with with HF's load_datasets where n_proc>1. To alleviate this issue, you can set torch.set_num_threads(1) in src/panza3/finetuning/train.py or set the equivalent parameter in configs/finetuning/rosa.yaml. + A known issue is that when you fine-tune your model with RAG, there can be a case when the tokenization of the dataset seemingly hangs. This is due to a known bug with with HF's map function where n_proc>1. To alleviate this issue, you can set torch.set_num_threads(1) in src/panza3/finetuning/train.py or set the equivalent parameter in configs/finetuning/rosa.yaml. From fee7cbd4012ebc6274ffc24bf32bd9f668ea9eb2 Mon Sep 17 00:00:00 2001 From: Andrej Jovanovic Date: Tue, 12 Nov 2024 16:13:27 +0100 Subject: [PATCH 099/112] Clean up code TODOs and revert to defaults --- prompt_preambles/user_preamble.txt | 7 +++++-- scripts/prepare_data.py | 3 +-- src/panza3/finetuning/train.py | 2 +- src/panza3/interface/json.py | 8 ++++++++ src/panza3/llm/local.py | 1 - src/panza3/writer.py | 1 - 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/prompt_preambles/user_preamble.txt b/prompt_preambles/user_preamble.txt index a68751a..cbc16a5 100644 --- a/prompt_preambles/user_preamble.txt +++ b/prompt_preambles/user_preamble.txt @@ -2,5 +2,8 @@ # better help Panza write on their behalf. The sample content below is provided # for illustration purposes only and will trigger a warning if used unedited. -My name is Jen Iofinova. - +[CHANGE ME] My name is Jane Doe. I work as a manager at Acme Corp. +My address is 123 Main Street, Springfield, IL, USA. +My boss's name is Alex Burns. My children's names are Elsa, Anna, and Olaf. +I am deeply committed to my hobby of underwater basket weaving, for which +we meet every Thursday at noon. \ No newline at end of file diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py index 1b94e39..62bd033 100644 --- a/scripts/prepare_data.py +++ b/scripts/prepare_data.py @@ -57,7 +57,6 @@ def generate_synthetic_instructions( for i in tqdm(range(0, len(documents), batch_size)): print(f"--> Processing batch {i // batch_size + 1}/{num_batches}") batch = documents[i : i + batch_size] - # TODO: Rename .email to .content instructions = [ SummarizationInstruction(instruction=document.email) for document in batch ] @@ -66,7 +65,7 @@ def generate_synthetic_instructions( num_processed_documents += len(summaries) for it, summary in enumerate(summaries): - # TODO: Add cleaning and filtering + # Considerf adding cleaning and filtering here. batch[it].summary = summary # Write the summarized documents to a file diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index 2d85aff..5276281 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -759,7 +759,7 @@ def main(cfg: DictConfig) -> Trainer: # Log number of parameters if hasattr(model, "n_total_params"): n_params = model.n_total_params - n_trainable_params = n_params # TODO: we currently assume all parameters are trainable. + n_trainable_params = n_params # We currently assume all parameters are trainable. 
else: n_params = sum(p.numel() for p in model.parameters()) n_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) diff --git a/src/panza3/interface/json.py b/src/panza3/interface/json.py index 85efb73..3023ee6 100644 --- a/src/panza3/interface/json.py +++ b/src/panza3/interface/json.py @@ -11,6 +11,14 @@ from torchmetrics.text.bleu import BLEUScore from torchmetrics.text.rouge import ROUGEScore import string +import nltk + +# Ensure that tokenizer has been downloaded to ensure script does not fail. +try: + nltk.find("tokenizers/punkt_tab") +except: + print("punkt_tab was not downloaded. Installing.") + nltk.download("punkt_tab") punc_table = str.maketrans({key: None for key in string.punctuation}) rouge = ROUGEScore() diff --git a/src/panza3/llm/local.py b/src/panza3/llm/local.py index 20bc656..f887e39 100644 --- a/src/panza3/llm/local.py +++ b/src/panza3/llm/local.py @@ -51,7 +51,6 @@ def __init__( self.dtype = dtype self.load_in_4bit = load_in_4bit - # TODO: Add conditional import for BitsAndBytesConfig? self.quantization_config = ( BitsAndBytesConfig( load_in_4bit=True, diff --git a/src/panza3/writer.py b/src/panza3/writer.py index 885c59e..312e4a3 100644 --- a/src/panza3/writer.py +++ b/src/panza3/writer.py @@ -5,7 +5,6 @@ from .prompting import PromptBuilder -# TODO: Check that instruction type is compatible with prompt_builder type? class PanzaWriter: def __init__(self, prompt_builder: PromptBuilder, llm: LLM): self.prompt_builder = prompt_builder From 99f5c7cd82f80940b65a45d5cd3daa9e63a432f1 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Tue, 12 Nov 2024 16:18:35 +0100 Subject: [PATCH 100/112] move top-level README to default location --- README.md | 104 +++++++++++------------- README_panza3.md | 205 ----------------------------------------------- 2 files changed, 49 insertions(+), 260 deletions(-) delete mode 100644 README_panza3.md diff --git a/README.md b/README.md index 975fca7..787a777 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Its main features are as follows: -## Prerequisites +## TODO: Prerequisites - Your emails, exported to `mbox` format (see tutorial below). - A computer, preferably with a NVIDIA GPU with at least 24 GiB of memory (alternatively, check out [running in Google Colab](#cloud-try-out-panza-in-google-colab)). - A Hugging Face [account](https://huggingface.co/login) to download the models (free of charge). @@ -62,30 +62,21 @@ The overall structure of Panza is as follows: ### Conda 1. Make sure you have a version of [conda](https://docs.anaconda.com/free/miniconda/miniconda-install/) installed. -2. Run `source prepare_env.sh`. This script will create a conda environment named `panza` and install the required packages. - -### Docker -As an alternative to the conda option above, you can run the following commands to pull a docker image with all the dependencies installed. -``` -docker pull istdaslab/panzamail -``` - -or alternatively, you can build the image yourself: -``` -docker build . -f Dockerfile -t istdaslab/panzamail -``` - -Then run it with: -``` -docker run -it --gpus all istdaslab/panzamail /bin/bash +2. Create a new conda environment named 'panza' (or something else) and activate it: +``` bash +conda create -n panza python=3.10 -y +conda activate panza ``` - -In the docker you can activate the `panza` environment with: +3. Install the required packages: +``` bash +pip install . ``` -micromamba activate panza +4. 
If you want to also finetune models using Panza, you will need to install the additional packages: +``` bash +pip install .[training] ``` -## :rocket: Getting started +## TODO: :rocket: Getting started To quickly get started with building your own personalized email assistant, follow the steps bellow: @@ -118,16 +109,26 @@ At the end of this step you should have the downloaded emails placed inside `dat ### Step 1: Environment configuration -Panza is configured through a set of environment variables defined in `scripts/config.sh` and shared along all running scripts. +Panza is configured through a set of yaml configurations defined in `configs/`. There is a single high-level config under `configs/base.yaml`, and the rest are organized under the main functionalities of the code. +Note that these task-specific configs can, in some cases, be used to override base configs. + Specific use cases, such as hyperparameter tuning, are covered in more detail in `scripts/README.md`. (TODO jen: write this up.) - -The LLM prompt is controlled by a set of `prompt_preambles` that give the model more insight about its role, the user and how to reuse existing emails for *Retrieval-Augmented Generation (RAG)*. See more details in the [prompting section](prompt_preambles/README.md). +1. Data preparation: `configs/data_preparation.yaml`. Additionally, a custom user config must be added under `config/users/` (see below). +1. Finetuning: the main config is in `configs/panza_finetuning.yaml` and the method-specific ones are in `configs/finetuning/` +1. Serving: Serving consists of two parts - a serving infrastructure (that we call 'writer') that runs the LLM and so converts prompts to Panza outputs, and an `interface`, which presents the outputs in a useful form - through a command-line interface, a web interface, a gmail client (TODO:Sean), or in a bulk `.json` format (useful for evaluation). The configs for serving are in `panza_writer.yaml`, and for the interfaces, under `configs/interfaces`. + +These scripts are described in more detail in `scripts/README.md`, but a few customizations need to happen immediately. :warning: Before continuing, make sure you complete the following setup: - - Modifiy the environment variable `PANZA_EMAIL_ADDRESS` inside `scripts/config.sh` with your own email address. - - Modifiy `prompt_preambles/user_preamble.txt` with your own information. If you choose, this can even be empty. +- Optionally, copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. A useful tip for choosing the name of `[YOURNAME]` is to set it to the output of `whoami`. +- In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.). The username does not have to link to the email itself - it is simply used as a name for the various data files that will come out of the data preparation process. A handy way to set this is if you set it to be the output of the `whoami` call in your shell. +- Modify the personal prompt in `prompt_preambles/user_preamble.txt` to include some basic information about yourself that Panza can use to customize your emails with your correct full name, address, phone number, etc. + + +Additionally, please perform the following login steps to be able to download the base model. 
- Login to Hugging Face to be able to download pretrained models: `huggingface-cli login`. - - [Optional] Login to Weights & Biases to log metrics during training: `wandb login`. Then, set `PANZA_WANDB_DISABLED=False` in `scripts/config.sh`. + - [Optional] Login to Weights & Biases to log metrics during training: `wandb login`. Then, set `wandb_disabled=false` in `configs/finetuning/base.yaml`. + You are now ready to move to `scripts`. ``` bash @@ -137,63 +138,56 @@ cd scripts ### Step 2: Extract emails -1. Run `./extract_emails.sh`. This extracts your emails in text format to `data/_clean.jsonl` which you can manually inspect. - -2. If you wish to eliminate any emails from the training set (e.g. containing certain personal information), you can simply remove the corresponding rows. - -### Step 3: Prepare dataset - - -1. Simply run `./prepare_dataset.sh`.
+1. Run `CUDA_VISIBLE_DEVICES=X python ./prepare_data.py`.
This scripts takes care of all the prerequisites before training (expand for details). + - Extracts your emails in text format to `data/_clean.jsonl` which you can manually inspect. - Creates synthetic prompts for your emails as described in the [data playback](#film_projector-step-1-data-playback) section. The results are stored in `data/_clean_summarized.jsonl` and you can inspect the `"summary"` field. - Splits data into training and test subsets. See `data/train.jsonl` and `data/test.jsonl`. - Creates a vector database from the embeddings of the training emails which will later be used for *Retrieval-Augmented Generation (RAG)*. See `data/.pkl` and `data/.faiss`.
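For intuition, here is a minimal, self-contained sketch of what the vector-database step amounts to conceptually. It is not Panza's actual implementation: the embedding model name, the `data/train.jsonl` path, and the `email` field name are assumptions made for illustration only.

``` python
# Hedged sketch (not Panza's real code): embed training emails and index them
# with FAISS so that similar past emails can be retrieved at generation time.
import json

import faiss
from sentence_transformers import SentenceTransformer

with open("data/train.jsonl") as f:                      # assumed path/format
    emails = [json.loads(line)["email"] for line in f]   # assumed field name

encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # illustrative model
embeddings = encoder.encode(emails, convert_to_numpy=True, normalize_embeddings=True)

index = faiss.IndexFlatIP(embeddings.shape[1])  # inner product == cosine on normalized vectors
index.add(embeddings)

# At serving time, the prompt is embedded the same way and the nearest past
# emails are passed to the model as extra context.
query = encoder.encode(["Please reschedule our weekly sync"],
                       convert_to_numpy=True, normalize_embeddings=True)
scores, ids = index.search(query, 3)
print([emails[i] for i in ids[0]])
```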
-### Step 4: Train a LLM on your emails - +ODO Jen: This doesn't work anymore, because we make the RAG database right away. If you wish to eliminate any emails from the training set (e.g. containing certain personal information), you can simply remove the corresponding rows. + +### Step 3: Train a LLM on your emails + We currently support `LLaMA3-8B-Instruct` and `Mistral-Instruct-v0.2` LLMs as base models; the former is the default, but we obtained good results with either model. 1. [Recommended] For parameter efficient fine-tuning, run `./train_rosa.sh`. If a larger GPU is available and full-parameter fine-tuning is possible, run `./train_fft.sh`. -2. We have prepopulated the training scripts with parameter values that worked best for us. We recommend you try those first, but you can also experiment with different hyper-parameters by passing extra arguments to the training script, such as `LR`, `LORA_LR`, `NUM_EPOCHS`. All the trained models are saved in the `checkpoints` directory. +2. We have prepopulated the training configs with parameter values that worked best for us. We recommend you try those first, but you can also experiment with different hyper-parameters by passing extra arguments to the training script, such as `lr`, `lora_lr`, `num_epochs`. All the trained models are saved in the `checkpoints` directory. Examples: ``` bash ./train_rosa.sh # Will use the default parameters. -./train_rosa.sh LR=1e-6 LORA_LR=1e-6 NUM_EPOCHS=7 # Will override LR, LORA_LR, and NUM_EPOCHS. +./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep. ``` +
+ FAQs. + The bash scripts that are used to execute the finetuning procedure assume by default that your username is what is returned by the whoami command. This is used to locate the name of the user configs inside the configs/user directory as above. If you directly modified default.yaml, or created another yaml file where the name of that file does not match with the output of whoami, there will be an error. This is an easy fix. You can either: +
    +
  1. Change the name of the yaml file to be the output of whoami. +
  2. You can override the username manually when you launch the bash script by adding user=x where x is the name of the yaml file you created. For example: ./train_rosa.sh user=alonso +
+
+ If you wish to use CUDA_VISIBLE_DEVICES to run on a specific GPU, add export CUDA_VISIBLE_DEVICES=x directly to the shell script, where x is the ID of the GPU you wish to use.
+ ### Step 5: Launch Panza! -1. Run `./run_panza_gui.sh MODEL=` to serve the trained model in a friendly GUI. -Alternatively, if you prefer using the CLI to interact with Panza, run `./run_panza_cli.sh` instead. - -You can experiment with the following arguments: -- If `MODEL` is not specified, it will use a pretrained `Meta-Llama-3-8B-Instruct` model by default, although Panza also works with `Mistral-7B-Instruct-v2`. Try it out to compare the syle difference! -- To disable RAG, run with `PANZA_DISABLE_RAG_INFERENCE=1`. +- To run Panza after a full training run, try something like `CUDA_VISIBLE_DEVICES=0 python3 runner.py user=USERNAME interfaces=cli writer/llm=transformers`. +- To run Panza after a RoSA or LoRA training run, replace `writer/llm=transformers` with `writer/llm=peft` TODO Armand: can we fix this? -Example: -``` bash -./run_panza_gui.sh \ - MODEL=/local/path/to/this/repo/checkpoints/models/panza-rosa_1e-6-seed42_7908 \ - PANZA_DISABLE_RAG_INFERENCE=0 # this is the default behaviour, so you can omit it -``` :email: **Have fun with your new email writing assistant!** :email: -## :cloud: Try out Panza in Google Colab - -- You can run Panza in a Google Colab instance [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IST-DASLab/PanzaMail/blob/main/notebooks/panza_colab.ipynb). - ## :microscope: Advanced usage - [Data Preparation Guide](./scripts/README.md#data-guide) diff --git a/README_panza3.md b/README_panza3.md deleted file mode 100644 index 787a777..0000000 --- a/README_panza3.md +++ /dev/null @@ -1,205 +0,0 @@ -
- panza demo -
- -# Panza: A personal email assistant, trained and running on-device - - - -## What is Panza? - - - - -Panza is an automated email assistant customized to your writing style and past email history. \ -Its main features are as follows: -* Panza produces a fine-tuned LLM that matches your writing style, pairing it with a Retrieval-Augmented Generation (RAG) component which helps it produce relevant emails. -* Panza **can be trained and run entirely locally**. Currently, it requires a single GPU with -16-24 GiB of memory, but we also plan to release a CPU-only version. **At no point in training or execution is your data shared with the entities that trained the original LLMs, with LLM distribution services such as Huggingface, or with us.** -* Training and execution are also quick - for a dataset on the order of 1000 emails, training Panza takes well under an hour, and generating a new email takes a few seconds at most. - -
- panza logo -
- - -## TODO: Prerequisites -- Your emails, exported to `mbox` format (see tutorial below). -- A computer, preferably with a NVIDIA GPU with at least 24 GiB of memory (alternatively, check out [running in Google Colab](#cloud-try-out-panza-in-google-colab)). -- A Hugging Face [account](https://huggingface.co/login) to download the models (free of charge). -- [Optional] A Weights & Biases [account](https://wandb.ai/login) to log metrics during training (free of charge). -- Basic Python and Unix knowledge, such as building environments and running python scripts. -- *No prior LLMs experience is needed*. - - -## How it works - -### :film_projector: Step 1: Data playback - -For most email clients, it is possible to download a user's past emails in a machine-friendly .mbox format. For example, GMail allows you to do this via [Google Takeout](https://takeout.google.com), whereas Thunderbird allows one to do this via various plugins. - -One key part of Panza is a dataset-generation technique we call **data playback**: Given some of your past emails in .mbox format, we automatically create a training set for Panza by using a pretrained LLM to summarize the emails in instruction form; each email becomes a `(synthetic instruction, real email)` pair. -Given a dataset consisting of all pairs, we use these pairs to "play back" your sent emails: the LLM receives only the instruction, and has to generate the "ground truth" email as a training target. - -We find that this approach is very useful for the LLM to "learn" the user's writing style. - - -### :weight_lifting: Step 2: Local Fine-Tuning via Robust Adaptation (RoSA) - -We then use parameter-efficient finetuning to train the LLM on this dataset, locally. We found that we get the best results with the [RoSA method](https://arxiv.org/pdf/2401.04679.pdf), which combines low-rank (LoRA) and sparse finetuning. If parameter efficiency is not a concern, that is, you have a more powerful GPU, then regular, full-rank/full-parameter finetuning can also be used. We find that a moderate amount of further training strikes the right balance between matching the writer's style without memorizing irrelevant details in past emails. - - -### :owl: Step 3: Serving via RAG - -Once we have a custom user model, Panza can be run locally together with a Retrieval-Augmented Generation (RAG) module. Specifically, this functionality stores past emails in a database and provides a few relevant emails as context for each new query. This allows Panza to better insert specific details, such as a writer's contact information or frequently used Zoom links. - -The overall structure of Panza is as follows: -
- panza logo -
- -## Installation - -### Conda -1. Make sure you have a version of [conda](https://docs.anaconda.com/free/miniconda/miniconda-install/) installed. -2. Create a new conda environment named 'panza' (or something else) and activate it: -``` bash -conda create -n panza python=3.10 -y -conda activate panza -``` -3. Install the required packages: -``` bash -pip install . -``` -4. If you want to also finetune models using Panza, you will need to install the additional packages: -``` bash -pip install .[training] -``` - -## TODO: :rocket: Getting started - -To quickly get started with building your own personalized email assistant, follow the steps bellow: - - - - -### Step 0: Download your sent emails - -
- Expand for detailed download instructions. - - We provide a description for doing this for GMail via Google Takeout. - - 1. Go to [https://takeout.google.com/](https://takeout.google.com/). - 2. Click `Deselect all`. - 3. Find `Mail` section (search for the phrase `Messages and attachments in your Gmail account in MBOX format`). - 4. Select it. - 5. Click on `All Mail data included` and deselect everything except `Sent`. - 6. Scroll to the bottom of the page and click `Next step`. - 7. Click on `Create export`. - 8. Wait for download link to arrive in your inbox. - 9. Download `Sent.mbox` and place it in the `data/` directory. - - For Outlook accounts, we suggest doing this via a Thunderbird plugin for exporting a subset of your email as an MBOX format, such as [this add-on](https://addons.thunderbird.net/en-us/thunderbird/addon/importexporttools-ng/). -
- -At the end of this step you should have the downloaded emails placed inside `data/Sent.mbox`. - - -### Step 1: Environment configuration - - -Panza is configured through a set of yaml configurations defined in `configs/`. There is a single high-level config under `configs/base.yaml`, and the rest are organized under the main functionalities of the code. -Note that these task-specific configs can, in some cases, be used to override base configs. - Specific use cases, such as hyperparameter tuning, are covered in more detail in `scripts/README.md`. (TODO jen: write this up.) - -1. Data preparation: `configs/data_preparation.yaml`. Additionally, a custom user config must be added under `config/users/` (see below). -1. Finetuning: the main config is in `configs/panza_finetuning.yaml` and the method-specific ones are in `configs/finetuning/` -1. Serving: Serving consists of two parts - a serving infrastructure (that we call 'writer') that runs the LLM and so converts prompts to Panza outputs, and an `interface`, which presents the outputs in a useful form - through a command-line interface, a web interface, a gmail client (TODO:Sean), or in a bulk `.json` format (useful for evaluation). The configs for serving are in `panza_writer.yaml`, and for the interfaces, under `configs/interfaces`. - - -These scripts are described in more detail in `scripts/README.md`, but a few customizations need to happen immediately. -:warning: Before continuing, make sure you complete the following setup: -- Optionally, copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. A useful tip for choosing the name of `[YOURNAME]` is to set it to the output of `whoami`. -- In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.). The username does not have to link to the email itself - it is simply used as a name for the various data files that will come out of the data preparation process. A handy way to set this is if you set it to be the output of the `whoami` call in your shell. -- Modify the personal prompt in `prompt_preambles/user_preamble.txt` to include some basic information about yourself that Panza can use to customize your emails with your correct full name, address, phone number, etc. - - -Additionally, please perform the following login steps to be able to download the base model. - - Login to Hugging Face to be able to download pretrained models: `huggingface-cli login`. - - [Optional] Login to Weights & Biases to log metrics during training: `wandb login`. Then, set `wandb_disabled=false` in `configs/finetuning/base.yaml`. - - -You are now ready to move to `scripts`. -``` bash -cd scripts -``` - -### Step 2: Extract emails - - -1. Run `CUDA_VISIBLE_DEVICES=X python ./prepare_data.py`.
- This scripts takes care of all the prerequisites before training (expand for details). - - - Extracts your emails in text format to `data/_clean.jsonl` which you can manually inspect. - - Creates synthetic prompts for your emails as described in the [data playback](#film_projector-step-1-data-playback) section. The results are stored in `data/_clean_summarized.jsonl` and you can inspect the `"summary"` field. - - Splits data into training and test subsets. See `data/train.jsonl` and `data/test.jsonl`. - - Creates a vector database from the embeddings of the training emails which will later be used for *Retrieval-Augmented Generation (RAG)*. See `data/.pkl` and `data/.faiss`. -
- -ODO Jen: This doesn't work anymore, because we make the RAG database right away. If you wish to eliminate any emails from the training set (e.g. containing certain personal information), you can simply remove the corresponding rows. - -### Step 3: Train a LLM on your emails - - -We currently support `LLaMA3-8B-Instruct` and `Mistral-Instruct-v0.2` LLMs as base models; the former is the default, but we obtained good results with either model. - -1. [Recommended] For parameter efficient fine-tuning, run `./train_rosa.sh`. -If a larger GPU is available and full-parameter fine-tuning is possible, run `./train_fft.sh`. - -2. We have prepopulated the training configs with parameter values that worked best for us. We recommend you try those first, but you can also experiment with different hyper-parameters by passing extra arguments to the training script, such as `lr`, `lora_lr`, `num_epochs`. All the trained models are saved in the `checkpoints` directory. - -Examples: -``` bash -./train_rosa.sh # Will use the default parameters. - -./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep. -``` -
- FAQs. - The bash scripts that are used to execute the finetuning procedure assume by default that your username is what is returned by the whoami command. This is used to locate the name of the user configs inside the configs/user directory as above. If you directly modified default.yaml, or created another yaml file where the name of that file does not match with the output of whoami, there will be an error. This is an easy fix. You can either: -
    -
  1. Change the name of the yaml file to be the output of whoami. -
  2. You can override the username manually when you launch the bash script by adding user=x where x is the name of the yaml file you created. For example: ./train_rosa.sh user=alonso -
-
- If you wish to add CUDA_VISIBLE_DEVICES to specify a specific GPU, please add this in the shell script directly by export CUDA_VISIBLE_DEVICES=x where x is the ID of the GPU you wish to use. -
- - -### Step 5: Launch Panza! - - -- To run Panza after a full training run, try something like `CUDA_VISIBLE_DEVICES=0 python3 runner.py user=USERNAME interfaces=cli writer/llm=transformers`. -- To run Panza after a RoSA or LoRA training run, replace `writer/llm=transformers` with `writer/llm=peft` TODO Armand: can we fix this? - - -:email: **Have fun with your new email writing assistant!** :email: - - - - -## :microscope: Advanced usage -- [Data Preparation Guide](./scripts/README.md#data-guide) -- [Hyper-Parameter Tuning Guide](./scripts/README.md#hyper-parameter-tuning-guide) -- [Prompt Preambles Tutorial](prompt_preambles/README.md) - -## Authors - -Panza was conceived by Nir Shavit and Dan Alistarh and built by the [Distributed Algorithms and Systems group](https://ist.ac.at/en/research/alistarh-group/) at IST Austria. The contributors are (in alphabetical order): - -Dan Alistarh, Eugenia Iofinova, Eldar Kurtic, Ilya Markov, Armand Nicolicioiu, Mahdi Nikdan, Andrei Panferov, and Nir Shavit. - -Contact: dan.alistarh@ist.ac.at - -We thank our collaborators Michael Goin and Tony Wang at NeuralMagic and MIT for their helpful testing and feedback. From 1238d3957a495dccb86dffe7af4c6e46daa31746 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Tue, 12 Nov 2024 16:53:29 +0100 Subject: [PATCH 101/112] update the scripts/ readme --- scripts/README.md | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index bc77e78..d058937 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -8,21 +8,48 @@ This directory contains all scripts necessary to train and run Panza. We provide * `config.sh` sets the necessary environment variables and other parameters used throughout the Panza workflow. This script should be edited by the user in several places: to set the user's email address (for data preprocessing), to select the LLM used for data summarization and Panza finetuning, and optionally to update the locations the data and models will be stored. #### Data preparation -* `extract_emails.sh` extracts the user's emails from the `.mbox` file and removes any unusable ones (such as email forwards, those that seem to be written in a foreign language, or those that are too short). -* `prepare_dataset.sh` automatically converts emails to training data by using an LLM to write their summaries in the form of prompts; it then splits them into train and test data, and prepares the RAG database. +* `prepare_data.py` does several things: + +1. Extracts the user's emails from the `.mbox` file and removes any unusable ones (such as email forwards, those that seem to be written in a foreign language, or those that are too short). +1. Automatically converts emails to training and test data by using an LLM to write their summaries in the form of prompts. +1. Optionally, splits the summarized into train and test data. This is not done by default because we expect most users to use the default hyperparameters, and therefore have no need for evaluation. To activate this feature, indicate the size of the test split as follows: `python ./prepare_data.py test_split=0.2` +1. Prepares the RAG database. Note that only train data is used for this step. #### Training -* `train_rosa.sh` performs [parameter-efficient training](https://arxiv.org/pdf/2401.04679.pdf), and evaluation. For evaluation, we use a heldout email dataset and compute the BLEU score between the output email and the one originally written by the user. 
-* `train_fft.sh` performs full-parameter/full-rank training, and then evaluation (as before). _Note that this requires additional computational resources (about 2x)._ +* `train_rosa.sh` performs [parameter-efficient training](https://arxiv.org/pdf/2401.04679.pdf). +* `train_fft.sh` performs full-parameter/full-rank training. _Note that this requires additional computational resources (about 2x)._ + + +#### Inference/Serving + +Serving is done through the `runner` object. To use the runner, the type of model and the type of interface must be specified. + +For interfaces, we offer serving via CLI (command-line inference) and an online GUI (via Gradio), as well as a bulk-serving API via JSON for the JSON, the location of the file defaults to the test data, but can be overridden (see the "evaluation" section, below). + +Currently, we support full-finetuned and parameter-efficienty-finetuned models. These must be set through the `writer-llm` parameter. +* To serve a foundation (i.e., not locally-finetuned) model or a fully-finetuned model, set `writer/llm=transformers` +* To serve a PEFT model, set `writer/llm=peft` + +Thus, a serving command would look something like: + +``` +python runner.py user=[username] interfaces=[cli|gui] writer/llm=[peft|transformers] checkpoint=[checkpoint_loc] +``` + +For the json interface, it would look like: + +``` +python runner.py user=[username] interfaces=json writer/llm=[peft|transformers] checkpoint=[checkpoint_loc] interfaces.input_file=[json_file_loc] +``` + +##### Evaluation -#### Serving -* `run_panza_cli.sh` runs a simple tool in the command line that enables a user to put in prompts and get Panza responses. -* `run_panza_gui.sh` runs a simple tool in the browser that enables a user to put in prompts and get Panza responses. +We think of evaluation as a special form of bulk inference/serving. Thus, like other forms of inference, it is done through a runner, specifically through the `json` interface. -Both of these tools require a link to the model that you wish to use. Running without providing a `MODEL` argument will run inference on the base (non-finetuned) LLM. +A sample command that runs interface over the test set looks like: ``` -./run_panza_gui.sh MODEL= +python runner.py user=jen interfaces=json writer/llm=[peft|transformers] checkpoint=[checkpoint_loc] interfaces.input_file=../data/test.jsonl ``` From c0a94a387331745a8c5f977d58e37ce161f5c8cb Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Tue, 12 Nov 2024 16:54:50 +0100 Subject: [PATCH 102/112] remove useless assert --- src/panza3/finetuning/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/panza3/finetuning/train.py b/src/panza3/finetuning/train.py index ecdd038..1af6f4a 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza3/finetuning/train.py @@ -262,7 +262,6 @@ def build_composer_peft_model( bnb_4bit_quant_type='nf4', ) elif weight_bias_dtype == 'bf16': - assert weight_bias_dtype == 'bf16', 'Only bf16 is supported for now' compute_dtype = torch.bfloat16 quant_config = None else: From 3e2203cdd999dbb4e0d28ee2d1dcfe4cd346403f Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Tue, 12 Nov 2024 17:00:35 +0100 Subject: [PATCH 103/112] Merge changes. 
--- README.md | 2 +- README_old.md | 234 ++++ configs/panza_preparation.yaml | 2 +- configs/user/default.yaml | 2 +- configs/user/jen.yaml | 9 + scripts/eval_base_model.sh | 102 ++ scripts/eval_model.sh | 34 + scripts/eval_rosa.sh | 114 ++ scripts/finetune_simple.py | 1016 +++++++++++++++++ scripts/output.tx | 0 .../evaluation/.evaluate_summaries.py.swp | Bin 0 -> 16384 bytes src/panza/evaluation/evaluate | 180 +++ src/panza/evaluation/evaluate_backup.py | 180 +++ src/panza/utils/documents.py | 46 + .../data_preparation/prepare_raft_emails.py | 92 ++ src/panza3/data_preparation/split_data.py | 43 + .../data_preparation/summarize_emails.py | 202 ++++ src/panza3/interface/gui_b.py | 31 + src/panza3/utils/documents.py | 46 + src/panza3/utils/prompting.py | 175 +++ src/panza3/utils/rag.py | 37 + 21 files changed, 2544 insertions(+), 3 deletions(-) create mode 100644 README_old.md create mode 100644 configs/user/jen.yaml create mode 100755 scripts/eval_base_model.sh create mode 100755 scripts/eval_model.sh create mode 100755 scripts/eval_rosa.sh create mode 100644 scripts/finetune_simple.py create mode 100644 scripts/output.tx create mode 100644 src/panza/evaluation/.evaluate_summaries.py.swp create mode 100644 src/panza/evaluation/evaluate create mode 100644 src/panza/evaluation/evaluate_backup.py create mode 100644 src/panza/utils/documents.py create mode 100644 src/panza3/data_preparation/prepare_raft_emails.py create mode 100644 src/panza3/data_preparation/split_data.py create mode 100644 src/panza3/data_preparation/summarize_emails.py create mode 100644 src/panza3/interface/gui_b.py create mode 100644 src/panza3/utils/documents.py create mode 100644 src/panza3/utils/prompting.py create mode 100644 src/panza3/utils/rag.py diff --git a/README.md b/README.md index 787a777..d9824d7 100644 --- a/README.md +++ b/README.md @@ -198,7 +198,7 @@ Examples: Panza was conceived by Nir Shavit and Dan Alistarh and built by the [Distributed Algorithms and Systems group](https://ist.ac.at/en/research/alistarh-group/) at IST Austria. The contributors are (in alphabetical order): -Dan Alistarh, Eugenia Iofinova, Eldar Kurtic, Ilya Markov, Armand Nicolicioiu, Mahdi Nikdan, Andrei Panferov, and Nir Shavit. +Dan Alistarh, Eugenia Iofinova, Andrej Jovanovic, Eldar Kurtic, Ilya Markov, Armand Nicolicioiu, Mahdi Nikdan, Andrei Panferov, Nir Shavit, and Sean Yang. Contact: dan.alistarh@ist.ac.at diff --git a/README_old.md b/README_old.md new file mode 100644 index 0000000..c2c1d09 --- /dev/null +++ b/README_old.md @@ -0,0 +1,234 @@ +
+ panza demo +
+ +# Panza: A personal email assistant, trained and running on-device + + + +## What is Panza? + + + + +Panza is an automated email assistant customized to your writing style and past email history. \ +Its main features are as follows: +* Panza produces a fine-tuned LLM that matches your writing style, pairing it with a Retrieval-Augmented Generation (RAG) component which helps it produce relevant emails. +* Panza **can be trained and run entirely locally**. Currently, it requires a single GPU with +16-24 GiB of memory, but we also plan to release a CPU-only version. **At no point in training or execution is your data shared with the entities that trained the original LLMs, with LLM distribution services such as Huggingface, or with us.** +* Training and execution are also quick - for a dataset on the order of 1000 emails, training Panza takes well under an hour, and generating a new email takes a few seconds at most. + +
+ panza logo +
+ + +## TODO: Prerequisites +- Your emails, exported to `mbox` format (see tutorial below). +- A computer, preferably with a NVIDIA GPU with at least 24 GiB of memory (alternatively, check out [running in Google Colab](#cloud-try-out-panza-in-google-colab)). +- A Hugging Face [account](https://huggingface.co/login) to download the models (free of charge). +- [Optional] A Weights & Biases [account](https://wandb.ai/login) to log metrics during training (free of charge). +- Basic Python and Unix knowledge, such as building environments and running python scripts. +- *No prior LLMs experience is needed*. + + +## How it works + +### :film_projector: Step 1: Data playback + +For most email clients, it is possible to download a user's past emails in a machine-friendly .mbox format. For example, GMail allows you to do this via [Google Takeout](https://takeout.google.com), whereas Thunderbird allows one to do this via various plugins. + +One key part of Panza is a dataset-generation technique we call **data playback**: Given some of your past emails in .mbox format, we automatically create a training set for Panza by using a pretrained LLM to summarize the emails in instruction form; each email becomes a `(synthetic instruction, real email)` pair. +Given a dataset consisting of all pairs, we use these pairs to "play back" your sent emails: the LLM receives only the instruction, and has to generate the "ground truth" email as a training target. + +We find that this approach is very useful for the LLM to "learn" the user's writing style. + + +### :weight_lifting: Step 2: Local Fine-Tuning via Robust Adaptation (RoSA) + +We then use parameter-efficient finetuning to train the LLM on this dataset, locally. We found that we get the best results with the [RoSA method](https://arxiv.org/pdf/2401.04679.pdf), which combines low-rank (LoRA) and sparse finetuning. If parameter efficiency is not a concern, that is, you have a more powerful GPU, then regular, full-rank/full-parameter finetuning can also be used. We find that a moderate amount of further training strikes the right balance between matching the writer's style without memorizing irrelevant details in past emails. + + +### :owl: Step 3: Serving via RAG + +Once we have a custom user model, Panza can be run locally together with a Retrieval-Augmented Generation (RAG) module. Specifically, this functionality stores past emails in a database and provides a few relevant emails as context for each new query. This allows Panza to better insert specific details, such as a writer's contact information or frequently used Zoom links. + +The overall structure of Panza is as follows: +
+ panza logo +
+ +## Installation + +### Conda +1. Make sure you have a version of [conda](https://docs.anaconda.com/free/miniconda/miniconda-install/) installed. +2. Create a new conda environment named 'panza' (or something else) and activate it: +``` bash +conda create -n panza python=3.10 -y +conda activate panza +``` +3. Install the required packages: +``` bash +pip install . +``` +4. If you want to also finetune models using Panza, you will need to install the additional packages: +``` bash +pip install .[training] +``` + +## TODO: :rocket: Getting started + +To quickly get started with building your own personalized email assistant, follow the steps bellow: + + + + +### Step 0: Download your sent emails + +
+ Expand for detailed download instructions. + + We provide a description for doing this for GMail via Google Takeout. + + 1. Go to [https://takeout.google.com/](https://takeout.google.com/). + 2. Click `Deselect all`. + 3. Find `Mail` section (search for the phrase `Messages and attachments in your Gmail account in MBOX format`). + 4. Select it. + 5. Click on `All Mail data included` and deselect everything except `Sent`. + 6. Scroll to the bottom of the page and click `Next step`. + 7. Click on `Create export`. + 8. Wait for download link to arrive in your inbox. + 9. Download `Sent.mbox` and place it in the `data/` directory. + + For Outlook accounts, we suggest doing this via a Thunderbird plugin for exporting a subset of your email as an MBOX format, such as [this add-on](https://addons.thunderbird.net/en-us/thunderbird/addon/importexporttools-ng/). +
+ +At the end of this step you should have the downloaded emails placed inside `data/Sent.mbox`. + + +### Step 1: Environment configuration + + +Panza is configured through a set of yaml configurations defined in `configs/`. There is a single high-level config under `configs/base.yaml`, and the rest are organized under the main functionalities of the code. +Note that these task-specific configs can, in some cases, be used to override base configs. + Specific use cases, such as hyperparameter tuning, are covered in more detail in `scripts/README.md`. (TODO jen: write this up.) + +1. Data preparation: `configs/data_preparation.yaml`. Additionally, a custom user config must be added under `config/users/` (see below). +1. Finetuning: the main config is in `configs/panza_finetuning.yaml` and the method-specific ones are in `configs/finetuning/` +1. Serving: Serving consists of two parts - a serving infrastructure (that we call 'writer') that runs the LLM and so converts prompts to Panza outputs, and an `interface`, which presents the outputs in a useful form - through a command-line interface, a web interface, a gmail client (TODO:Sean), or in a bulk `.json` format (useful for evaluation). The configs for serving are in `panza_writer.yaml`, and for the interfaces, under `configs/interfaces`. + + +These scripts are described in more detail in `scripts/README.md`, but a few customizations need to happen immediately. +:warning: Before continuing, make sure you complete the following setup: +- Copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. A useful tip for choosing the name of `[YOURNAME]` is to set it to the output of `whoami`. If you modify the default yaml, you will need specify `user=default` as an extra flag in the succeeding steps. +- In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.). The username does not have to link to the email itself - it is simply used as a name for the various data files that will come out of the data preparation process. A handy way to set this is if you set it to be the output of the `whoami` call in your shell. +- Modify the personal prompt in `prompt_preambles/user_preamble.txt` to include some basic information about yourself that Panza can use to customize your emails with your correct full name, address, phone number, etc. + + +Additionally, please perform the following login steps to be able to download the base model. + - Login to Hugging Face to be able to download pretrained models: `huggingface-cli login`. + - [Optional] Login to Weights & Biases to log metrics during training: `wandb login`. Then, set `wandb_disabled=false` in `configs/finetuning/base.yaml`. + + +You are now ready to move to `scripts`. +``` bash +cd scripts +``` + +### Step 2: Extract emails + + +1. Run `CUDA_VISIBLE_DEVICES=X python ./prepare_data.py`.
+ This script takes care of all the prerequisites before training (expand for details).
+
+ - Extracts your emails in text format to `data/_clean.jsonl`, which you can inspect manually (a quick way to do this is shown below).
+ - Creates synthetic prompts for your emails as described in the [data playback](#film_projector-step-1-data-playback) section. The results are stored in `data/_clean_summarized.jsonl` and you can inspect the `"summary"` field.
+ - Splits the data into training and test subsets. See `data/train.jsonl` and `data/test.jsonl`.
+ - Creates a vector database from the embeddings of the training emails, which will later be used for *Retrieval-Augmented Generation (RAG)*. See `data/.pkl` and `data/.faiss`.
+
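+ A quick, optional sanity check of the generated splits (a sketch assuming the default data directory `data/` at the repository root; adjust the paths if you changed `data_dir` in your user config):
+
+ ``` bash
+ # Run from the repository root: count the train/test examples and
+ # pretty-print one record to inspect its fields.
+ wc -l data/train.jsonl data/test.jsonl
+ head -n 1 data/test.jsonl | python -m json.tool
+ ```
+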
+**NB**: if you did not change the default configuration in `configs/user/default.yaml` to reflect your particulars, but instead created a new user config named `x.yaml`, you need to add the extra flag `user=x` to the command above (see the example below).
+
+
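+For example, assuming you named your user config `alice.yaml` (the name `alice` is only a placeholder), the whole setup could look like this:
+
+``` bash
+# Sketch: create a personal config from the default one (run from the repository
+# root), then launch data preparation from scripts/ as in the step above.
+# Replace 'alice' with the name of your own config file.
+cp configs/user/default.yaml configs/user/alice.yaml
+# ... edit configs/user/alice.yaml to set email_address and username ...
+cd scripts
+CUDA_VISIBLE_DEVICES=0 python ./prepare_data.py user=alice
+```
+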
+ FAQs. + When running the above script, you may encounter an OutOfMemoryError. If this is the case, you can either: +
    +
  1. Reduce the batch size for the data processing step. This can be found in `configs/panza_preparation.yaml` (see the example override after this list).
+
  2. Move to a machine that has more memory. +
+
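+ For instance, if the batch size key in `configs/panza_preparation.yaml` were named `batch_size` (this name is only a guess - check the file for the actual parameter name), it could be overridden from the command line in the same way as other config values:
+
+ ``` bash
+ # Hypothetical override; verify the real key name in configs/panza_preparation.yaml.
+ CUDA_VISIBLE_DEVICES=0 python ./prepare_data.py user=alice batch_size=4
+ ```
+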
+
+TODO Jen: This doesn't work anymore, because we make the RAG database right away. If you wish to eliminate any emails from the training set (e.g. those containing certain personal information), you can simply remove the corresponding rows.
+
+### Step 3: Train an LLM on your emails
+
+
+We currently support `LLaMA3-8B-Instruct` and `Mistral-Instruct-v0.2` LLMs as base models; the former is the default, but we obtained good results with either model.
+
+1. [Recommended] For parameter-efficient fine-tuning, run `./train_rosa.sh`.
+If a larger GPU is available and full-parameter fine-tuning is possible, run `./train_fft.sh`.
+
+2. We have prepopulated the training configs with parameter values that worked best for us. We recommend you try those first, but you can also experiment with different hyper-parameters by passing extra arguments to the training script, such as `lr`, `lora_lr`, and `num_epochs`. All the trained models are saved in the `checkpoints` directory.
+
+Examples:
+``` bash
+CUDA_VISIBLE_DEVICES=X ./train_rosa.sh  # Will use the default parameters.
+
+CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep
+```
+
+ FAQs.
+ The bash scripts that execute the finetuning procedure assume by default that your username is what is returned by the `whoami` command. This is used to locate your user config inside the `configs/user` directory, as above. If you directly modified `default.yaml`, or created another yaml file whose name does not match the output of `whoami`, there will be an error. This is an easy fix. You can either:
+
    +
  1. Change the name of the yaml file to match the output of `whoami`.
+
  2. Override the username manually when you launch the bash script by adding `user=x`, where `x` is the name of the yaml file you created. For example: `./train_rosa.sh user=alonso`.
+
+
+ If you wish to use `CUDA_VISIBLE_DEVICES` to select a specific GPU, please add it to the shell script directly via `export CUDA_VISIBLE_DEVICES=x`, where `x` is the ID of the GPU you wish to use.
+
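+ For example, the export could be added near the top of `train_rosa.sh` (a sketch; GPU id 0 is only an example):
+
+ ``` bash
+ # Added near the top of train_rosa.sh to pin the run to a single GPU.
+ export CUDA_VISIBLE_DEVICES=0
+ ```
+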

+ A known issue is that when you fine-tune your model with RAG, the tokenization of the dataset may appear to hang. This is due to a known bug in HF's `map` function when `n_proc>1`. To alleviate this issue, you can set `torch.set_num_threads(1)` in `src/panza3/finetuning/train.py` or set the equivalent parameter in `configs/finetuning/rosa.yaml`.
+
+ + +### Step 5: Launch Panza! + + +- To run Panza after a full training run, try something like `CUDA_VISIBLE_DEVICES=0 python3 runner.py user=USERNAME interfaces=cli writer/llm=transformers`. +- To run Panza after a RoSA or LoRA training run, replace `writer/llm=transformers` with `writer/llm=peft` TODO Armand: can we fix this? + + +:email: **Have fun with your new email writing assistant!** :email: + + + + +## :microscope: Advanced usage +- [Data Preparation Guide](./scripts/README.md#data-guide) +- [Hyper-Parameter Tuning Guide](./scripts/README.md#hyper-parameter-tuning-guide) +- [Prompt Preambles Tutorial](prompt_preambles/README.md) + +## :woman_technologist: Contributing +If you liked our work and want to contribute to improve the system, please feel free to do so! Make a _fork_ of our repository and once you have made your changes, submit a pull request so that we can review! + +One thing to mention: we want to make sure that we all adhere to the same coding standards, so we have added Black, a code formatter, as a prehook. To ensure that all your files are formatted with Black, do the following: + +1. Install the necessary dependencies +``` +pip install .[contributing] +``` + +2. Run the precommit command +``` +pre-commit install +``` + +3. Continue adding code as usual. All your code will be formatted by Black before commiting! + +## Authors + +Panza was conceived by Nir Shavit and Dan Alistarh and built by the [Distributed Algorithms and Systems group](https://ist.ac.at/en/research/alistarh-group/) at IST Austria. The contributors are (in alphabetical order): + +Dan Alistarh, Eugenia Iofinova, Eldar Kurtic, Ilya Markov, Armand Nicolicioiu, Mahdi Nikdan, Andrei Panferov, and Nir Shavit. + +Contact: dan.alistarh@ist.ac.at + +We thank our collaborators Michael Goin and Tony Wang at NeuralMagic and MIT for their helpful testing and feedback. diff --git a/configs/panza_preparation.yaml b/configs/panza_preparation.yaml index 72134fa..550e732 100644 --- a/configs/panza_preparation.yaml +++ b/configs/panza_preparation.yaml @@ -12,7 +12,7 @@ summarized_emails_path: ${user.data_dir}/${user.username}_emails_clean_summarize rag_db_dir: ${user.data_dir} -checkpoint: "microsoft/Phi-3-mini-4k-instruct" +checkpoint: "microsoft/Phi-3-mini-4k-instruct" force_extract_clean_emails: false # If false, data will not be recreated if it already exists. # Parameters for train-test split, if required. diff --git a/configs/user/default.yaml b/configs/user/default.yaml index 0581579..4899d3a 100644 --- a/configs/user/default.yaml +++ b/configs/user/default.yaml @@ -1,5 +1,5 @@ email_address: "abc@xyz.com" # Change this to your email address! -username: "abc" # TODO(armand): Use custom resolver to extract username from email address. +username: "jen.iofinova" # TODO(armand): Use custom resolver to extract username from email address. data_dir: ${panza_workspace}/data diff --git a/configs/user/jen.yaml b/configs/user/jen.yaml new file mode 100644 index 0000000..f2b4203 --- /dev/null +++ b/configs/user/jen.yaml @@ -0,0 +1,9 @@ +email_address: "jen.iofinova@gmail.com" # Change this to your email address! +username: "jen.iofinova" # TODO(armand): Use custom resolver to extract username from email address. 
+ +data_dir: ${panza_workspace}/data + +system_preamble_path: ${panza_workspace}/prompt_preambles/system_preamble.txt +user_preamble_path: ${panza_workspace}/prompt_preambles/user_preamble.txt +rag_preamble_path: ${panza_workspace}/prompt_preambles/rag_preamble.txt +thread_preamble_path: ${panza_workspace}/prompt_preambles/thread_preamble.txt \ No newline at end of file diff --git a/scripts/eval_base_model.sh b/scripts/eval_base_model.sh new file mode 100755 index 0000000..9d7f97a --- /dev/null +++ b/scripts/eval_base_model.sh @@ -0,0 +1,102 @@ +set -e + +source config.sh + +current_user=$(whoami) + +export DATA_PATH=${PANZA_DATA_DIR}/train.jsonl + +# hyper-parameters with default values +export MASK_GEN_MODEL_PRECISION=${MODEL_PRECISION} # bf16, fp32, or 4bit +export BASE_SAVE_PATH=${PANZA_CHECKPOINTS} # where to store the checkpoints and generated masks +export NUM_EPOCHS=5 +export WARMUP=8 # the learning rate warmup (batches) +export BS=8 +export PER_DEVICE_BS=1 +export LORA_ALPHA=16 +export SCHEDULE=wl16 # the RoSA schedule +export SPA_NUM_GRADS=1 # number of gradients used for mask generation +export SPA_GRAD_ACC_MODE=mean_squared # 'mean' or 'mean_squared': how to accumulate gradients +export SEED=${PANZA_SEED} + + +export PANZA_RAG_RELEVANCE_THRESHOLD=0 # emails whose relevance is above this threshold will be presented for RAG + +if [[ ${MODEL_TYPE} == llama3 ]]; then + export LR=1e-5 # learning rate + export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters +elif [[ ${MODEL_TYPE} == mistralv2 ]]; then + export LR=1e-5 # learning rate + export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters +else + echo "Model type ${MODEL_TYPE} not recognized! Panza only works with mistralv2 and llama3 models. Exiting." + exit +fi + +echo "Using Learning Rate ${LR} and LoRA LR ${LORA_LR} for ${MODEL_TYPE} model" + + +# hyper-parameters without default values +export SPA_DENSITY=0.01 # the sparse adapters' density +export LORA_R=8 # the low-rank adapters' rank + +export WANDB_PROJECT="panza-${current_user}" +export PRETRAINED=${PANZA_GENERATIVE_MODEL} +export CONFIG=${PANZA_FINETUNE_CONFIGS}/rosa_panza.yaml +export NUM_CPU_THREADS=0 # useful for running of CPU, 0 means default the used by torch + +export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" # if not set, default to 0 + +# take all the input arguments and put them in environment variables +# this could override the hyper-parameters defined above +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" +done + +if [ "$PANZA_FINETUNE_WITH_PREAMBLE" = 1 ]; then + PREAMBLE_STR="PREAMBLE" + PREPROCESSING_FN=panza.finetuning.preprocessing:panza_preprocessing_function_train_with_preamble +else + PREAMBLE_STR="" + PREPROCESSING_FN=panza.finetuningpreprocessing:panza_preprocessing_function +fi + +# some post-processing on the inputs + + + + +echo $RUN_NAME +# Running BLEU evaluation +EVAL_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/evaluation.py +# python ${EVAL_SCRIPT} \ +# --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ +# --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ +# --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ +# --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ +# --golden=${PANZA_DATA_DIR}/test.jsonl \ +# --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ +# --wandb-run-id=${WANDB_RUN_ID} \ +# ${USE_4BIT_QUANT} + + 
#--model=/nfs/scistore19/alistgrp/eiofinov/.cache/huggingface/hub/models--ISTA-DASLab--Meta-Llama-3-8B-Instruct/snapshots/0e6f530447ceec1aea4fd96e2aafad06bb3aa4b5/ \ +# Running BLEU evaluation with RAG +python ${EVAL_SCRIPT} \ + --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ + --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ + --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ + --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ + --golden=${PANZA_DATA_DIR}/test.jsonl \ + --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ + --wandb-run-id=${WANDB_RUN_ID} \ + --embedding-model=${PANZA_EMBEDDING_MODEL} \ + --db-path=${PANZA_DATA_DIR} \ + --index-name=${PANZA_USERNAME} \ + --use-rag \ + ${USE_4BIT_QUANT} diff --git a/scripts/eval_model.sh b/scripts/eval_model.sh new file mode 100755 index 0000000..7e06e85 --- /dev/null +++ b/scripts/eval_model.sh @@ -0,0 +1,34 @@ +# Convenience script for running full finetuning. +# All arguments to the python script can be provided +# here exactly in the form they would be passed to the +# python script directly. +# +# Example usage: +# ./train_fft.sh user=alonso trainer.optimizer.lr=0.1 + +set -e + +vars=() +# Set a default for the required user argument. We'll override it +# later if provided. +vars[1]=$"user=$(whoami)" +idx=2 + +# process input arguments +for argument in "$@" +do + key=$(echo $argument | cut -f1 -d=) + + if [[ $key == user ]]; then + # We already set the default value here; change it now. + vars[1]=$argument + elif [[ $key == finetuning ]]; then + echo "The 'finetuning' argument is already set and should not be overridden here; override is ignored." + else + vars[idx]=$argument + idx+=1 + fi +done + +composer ../src/panza3/finetuning/train.py \ + finetuning=full ${vars[@]} \ No newline at end of file diff --git a/scripts/eval_rosa.sh b/scripts/eval_rosa.sh new file mode 100755 index 0000000..285df55 --- /dev/null +++ b/scripts/eval_rosa.sh @@ -0,0 +1,114 @@ +set -e + +source config.sh + +current_user=$(whoami) + +export DATA_PATH=${PANZA_DATA_DIR}/train.jsonl + +# hyper-parameters with default values +export MASK_GEN_MODEL_PRECISION=${MODEL_PRECISION} # bf16, fp32, or 4bit +export BASE_SAVE_PATH=${PANZA_CHECKPOINTS} # where to store the checkpoints and generated masks +export NUM_EPOCHS=5 +export WARMUP=8 # the learning rate warmup (batches) +export BS=8 +export PER_DEVICE_BS=1 +export LORA_ALPHA=16 +export SCHEDULE=wl16 # the RoSA schedule +export SPA_NUM_GRADS=1 # number of gradients used for mask generation +export SPA_GRAD_ACC_MODE=mean_squared # 'mean' or 'mean_squared': how to accumulate gradients +export SEED=${PANZA_SEED} + +if [[ ${MODEL_TYPE} == llama3 ]]; then + export LR=1e-5 # learning rate + export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters +elif [[ ${MODEL_TYPE} == mistralv2 ]]; then + export LR=1e-5 # learning rate + export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters +else + echo "Model type ${MODEL_TYPE} not recognized! Panza only works with mistralv2 and llama3 models. Exiting." 
+ exit +fi + +echo "Using Learning Rate ${LR} and LoRA LR ${LORA_LR} for ${MODEL_TYPE} model" + + +# hyper-parameters without default values +export SPA_DENSITY=0.01 # the sparse adapters' density +export LORA_R=8 # the low-rank adapters' rank + +export WANDB_PROJECT="panza-${current_user}" +export PRETRAINED=${PANZA_GENERATIVE_MODEL} +export CONFIG=${PANZA_FINETUNE_CONFIGS}/rosa_panza.yaml +export NUM_CPU_THREADS=0 # useful for running of CPU, 0 means default the used by torch + +export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" # if not set, default to 0 + +# take all the input arguments and put them in environment variables +# this could override the hyper-parameters defined above +for ARGUMENT in "$@" +do + KEY=$(echo $ARGUMENT | cut -f1 -d=) + + KEY_LENGTH=${#KEY} + VALUE="${ARGUMENT:$KEY_LENGTH+1}" + + export "$KEY"="$VALUE" +done + +if [ "$PANZA_FINETUNE_WITH_PREAMBLE" = 1 ]; then + PREAMBLE_STR="PREAMBLE" + PREPROCESSING_FN=panza.finetuning.preprocessing:panza_preprocessing_function_train_with_preamble +else + PREAMBLE_STR="" + PREPROCESSING_FN=panza.finetuningpreprocessing:panza_preprocessing_function +fi + +# some post-processing on the inputs +export MAX_DURATION=${NUM_EPOCHS}ep + +# create directories to save the masks and models +mkdir -p ${BASE_SAVE_PATH}/masks/ +mkdir -p ${BASE_SAVE_PATH}/models/ + +if [ "$MODEL_PRECISION" = "bf16" ]; then + export ROSA_DTYPE=bf16 +elif [ "$MODEL_PRECISION" = "4bit" ]; then + export ROSA_DTYPE=fp32 +elif [ "$MODEL_PRECISION" = "fp32" ]; then + export ROSA_DTYPE=fp32 +else + echo "Unknown model precision $MODEL_PRECISION" + exit 1 +fi + + + +# Running BLEU evaluation +EVAL_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/evaluation.py +python ${EVAL_SCRIPT} \ + --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ + --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ + --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ + --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ + --golden=${PANZA_DATA_DIR}/test.jsonl \ + --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ + --wandb-run-id=${WANDB_RUN_ID} \ + ${USE_4BIT_QUANT} + +# # Running BLEU evaluation with RAG +# python ${EVAL_SCRIPT} \ +# --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ +# --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ +# --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ +# --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ +# --golden=${PANZA_DATA_DIR}/test.jsonl \ +# --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ +# --wandb-run-id=${WANDB_RUN_ID} \ +# --embedding-model=${PANZA_EMBEDDING_MODEL} \ +# --db-path=${PANZA_DATA_DIR} \ +# --index-name=${PANZA_USERNAME} \ +# --use-rag \ +# ${USE_4BIT_QUANT} + +echo "find the adapter at ${BASE_SAVE_PATH}/models/${RUN_NAME}" diff --git a/scripts/finetune_simple.py b/scripts/finetune_simple.py new file mode 100644 index 0000000..35c68eb --- /dev/null +++ b/scripts/finetune_simple.py @@ -0,0 +1,1016 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 +import copy +import gc +import logging +import os +import random +import sys +import tempfile +import time +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import torch +from composer import Trainer +from composer.core.callback import Callback +from composer.metrics.nlp import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, + InContextLearningLMExpectedCalibrationError, + InContextLearningMCExpectedCalibrationError, + InContextLearningMultipleChoiceAccuracy, + InContextLearningQAAccuracy, LanguageCrossEntropy, + 
LanguagePerplexity) +from composer.optim import DecoupledAdamW +from composer.profiler import JSONTraceHandler, Profiler, TraceHandler, cyclic_schedule +from composer.utils import dist, get_device, reproducibility +from llmfoundry import ComposerHFCausalLM +from llmfoundry.eval.metrics.nlp import InContextLearningMetric +from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP +from llmfoundry.models.utils import init_empty_weights +from llmfoundry.utils import find_mosaicml_logger, log_train_analytics, maybe_create_mosaicml_logger +from omegaconf import DictConfig, ListConfig +from omegaconf import OmegaConf as om +from peft import get_peft_model +from peft.tuners.rosa import RosaConfig, RosaModel, RosaScheduler +from rich.traceback import install +from torch.distributed.fsdp import FullStateDictConfig +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import StateDictType +from transformers import AutoModelForCausalLM, BitsAndBytesConfig, PreTrainedTokenizerBase + +install() +from llmfoundry.callbacks import AsyncEval +from llmfoundry.data.dataloader import build_dataloader +from llmfoundry.layers_registry import ffns_with_megablocks +from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, build_algorithm, build_callback, + build_composer_model, build_evaluators, build_logger, + build_optimizer, build_scheduler, build_tokenizer) +from llmfoundry.utils.config_utils import (log_config, pop_config, process_init_device, + update_batch_size_info) +from llmfoundry.utils.registry_utils import import_file + +import hydra +from omegaconf import DictConfig, OmegaConf + +from panza3 import PanzaWriter # The import also loads custom Hydra resolvers + +log = logging.getLogger(__name__) + + +def validate_config(cfg: DictConfig): + """Validates compatible model and dataloader selection.""" + loaders = [cfg.train_loader] + if 'eval_loader' in cfg: + eval_loader = cfg.eval_loader + if isinstance(eval_loader, ListConfig): + for loader in eval_loader: + if loader.label is None: + raise ValueError( + 'When specifying multiple evaluation datasets, each one must include the \ + `label` attribute.') + loaders.append(loader) + else: + loaders.append(eval_loader) + for loader in loaders: + if loader.name == 'text': + if cfg.model.name == 'hf_t5': + raise ValueError( + f'Model type "{cfg.model.name}" is not supported when using the "text " ' +\ + f'dataloader. Only finetuning is supported.') + + if 'icl_tasks' in cfg: + if cfg.model.name == 'hf_t5': + raise ValueError( + 'ICL evaluation does not currently support Encoder-Decoder models, such as "hf_t5".' + ) + + if (cfg.model.get('fc_type', 'torch') != 'te' and 'te' not in cfg.model.get( + 'ffn_config', {}).get('ffn_type', 'mptmlp') and + 'fp8' in cfg.precision): + warnings.warn( + "fp8 only supported for te.Linear layers. Either set `cfg.model.fc_typ='te'` or " + + + "`cfg.model.ffn_config.ffn_type='te_ln_mlp'` to enable layers using fp8 precision." + ) + + if (cfg.model.get('fc_type', 'torch') == 'te' or + 'te' in cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp')): + fsdp_config = cfg.get('fsdp_config', None) + act_ckpt = fsdp_config.get('activation_checkpointing', False) + act_ckpt_reentrant = fsdp_config.get( + 'activation_checkpointing_reentrant', False) + if fsdp_config is not None and act_ckpt == True and act_ckpt_reentrant == True: + warnings.warn( + '`te.Linear` layers do not support activation_checkpointing with ' + + '`activation_checkpointing_reentrant = True`. 
' + + 'Setting cfg.fsdp_config.activation_checkpointing_reentrant=False.' + ) + cfg.fsdp_config.activation_checkpointing_reentrant = False + + if cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp') == 'te_ln_mlp': + warnings.warn( + '`te.LayerNormMLP` requires has issues with torch._dynamo. ' + + 'Setting `torch._dynamo.config.suppress_errors = True` and falling back to eager.' + ) + torch._dynamo.config.suppress_errors = True # type: ignore (third-party) + + if cfg.model.get('load_in_8bit', False): + raise ValueError( + '`load_in_8bit` is only supported for evaluation rather than training.' + ) + + if cfg.model.get('ffn_config', {}).get('ffn_type', + 'mptmlp') in ffns_with_megablocks: + moe_world_size = cfg.model.get('ffn_config', + {}).get('moe_world_size', 1) + use_orig_params = cfg.get('fsdp_config', + {}).get('use_orig_params', True) + if moe_world_size > 1 and not use_orig_params: + raise ValueError( + f'MoEs with expert parallelism (moe_world_size {moe_world_size} > 1) require `use_orig_params=True`.' + ) + + +def create_run_name(cfg: DictConfig) -> str: + # export RUN_NAME=panza_${PANZA_USERNAME}_${MODEL_TYPE}_${MODEL_PRECISION}-bs${BS}-fft-lr${LR}-epochs${NUM_EPOCHS}-wu${WARMUP}-seed${SEED}${PREAMBLE_STR}${RAFT_STR}-$RANDOM + + run_name = f"panza_{cfg.user.username}" + + model_name = cfg.model.split("/")[-1] + run_name += f"-{model_name}" + + run_name += f"-{cfg.model_precision}" + run_name += f"-bs{cfg.batch_size}" + + if hasattr(cfg.finetuning, "rosa"): + run_name += "-rosa" + else: + run_name += "-fft" + + run_name += f"-lr{cfg.lr}" + run_name += f"-epochs{cfg.num_epochs}" + run_name += f"-seed{cfg.seed}" + run_name += f"-{random.randint(1e6, 1e7 - 1)}" + + return run_name + + +def override_rosa_schedule(cfg: DictConfig, mask_generation=False) -> None: + # Disable struct mode to allow modifications + rosa_cfg = cfg.finetuning.rosa + OmegaConf.set_struct(rosa_cfg, False) + + mask_path = str(Path(cfg.checkpoint_dir) / "masks" / cfg.finetuning.run_name) + + if mask_generation: + rosa_cfg.schedule = "wl16" if rosa_cfg.lora_r != 0 else "spa_only" + rosa_cfg.mask_load_path = None + rosa_cfg.mask_save_path = mask_path + rosa_cfg.terminate_after_mask_generation = True + else: + if rosa_cfg.spa_d == 0 and rosa_cfg.lora_r != 0: + rosa_cfg.schedule = "default" + elif rosa_cfg.lora_r != 0: + rosa_cfg.schedule = "lora_only" + rosa_cfg.mask_load_path = None + else: + rosa_cfg.schedule = "spa_only" + + rosa_cfg.mask_load_path = mask_path + rosa_cfg.mask_save_path = None + rosa_cfg.terminate_after_mask_generation = None + + # Re-enable struct mode to lock down the configuration + OmegaConf.set_struct(rosa_cfg, True) + + +def create_experiment_yaml() -> str: + pass + + +def create_checkpoint_dirs(cfg: DictConfig) -> None: + # Create model directory + os.makedirs(os.path.join(cfg.checkpoint_dir, "models"), exist_ok=True) + + # Create mask directory + if hasattr(cfg.finetuning, "rosa"): + os.makedirs(os.path.join(cfg.checkpoint_dir, "masks"), exist_ok=True) + + +def get_hf_save_precision(cfg: DictConfig) -> str: + if cfg.model_precision == "bf16": + return "bfloat16" + elif cfg.model_precision == "fp32": + return "float32" + else: + raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") + + +def get_rosa_dtype(cfg: DictConfig) -> str: + if cfg.model_precision == "bf16": + return "bg16" + elif cfg.model_precision == "fp32": + return "fp32" + elif cfg.model_precision == "4bit": + return "fp32" + else: + raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") + 
+ +def override_config(cfg: DictConfig) -> None: + # Disable struct mode to allow modifications + OmegaConf.set_struct(cfg, False) + + cfg.finetuning.run_name = create_run_name(cfg) + + if hasattr(cfg.finetuning, "rosa"): + cfg.finetuning.rosa.rosa_dtype = get_rosa_dtype(cfg) + else: + cfg.finetuning.callbacks.hf_checkpointer.precision = get_hf_save_precision(cfg) + + # Re-enable struct mode to lock down the configuration + OmegaConf.set_struct(cfg, True) + + +def save_config_to_yaml(cfg: DictConfig) -> str: + cfg = OmegaConf.to_container(cfg, resolve=True) + with tempfile.NamedTemporaryFile("w", delete=False, suffix=".yaml") as temp_file: + OmegaConf.save(config=cfg, f=temp_file.name) + return temp_file.name + + +def build_composer_peft_model( + model_config: str, rosa_config: Dict[str, Any], + tokenizer: PreTrainedTokenizerBase, is_fsdp: bool = False) -> ComposerHFCausalLM: + + # 1) loads a hf model, 2) adds peft modules, 3) wraps it in a ComposerHFCausalLM. + print('Building model from HuggingFace checkpoint...') + + weight_bias_dtype = model_config.get('weight_bias_dtype', None) + if weight_bias_dtype == '4bit': + compute_dtype = torch.bfloat16 + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=compute_dtype, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4', + ) + elif weight_bias_dtype == 'bf16': + assert weight_bias_dtype == 'bf16', 'Only bf16 is supported for now' + compute_dtype = torch.bfloat16 + quant_config = None + else: + assert weight_bias_dtype == 'fp32' + compute_dtype = torch.float32 + quant_config = None + + with init_empty_weights(include_buffers=False): + model = AutoModelForCausalLM.from_pretrained( + model_config.pretrained_model_name_or_path, + device_map='cpu' if quant_config is None else 'auto', + torch_dtype=compute_dtype, + # load_in_4bit=weight_bias_dtype == '4bit', + quantization_config=quant_config, + trust_remote_code=True, + use_auth_token=True, + use_cache=False, + attn_implementation='eager' + ) + + print('Model built!') + if rosa_config is not None: + print('Building RoSA config...') + config = RosaConfig( + r=rosa_config['lora_r'], + d=rosa_config['spa_d'], + lora_alpha=rosa_config.get('lora_alpha', 16), + target_modules=rosa_config.get('target_modules', 'all-linear'), + lora_dropout=rosa_config.get('lora_dropout', 0.05), + impl=rosa_config.get('impl', 'auto'), + spa_store_transpose=rosa_config.get('spa_store_transpose', True), + rosa_dtype=rosa_config.get('rosa_dtype', True), + spa_num_grads=rosa_config.get('spa_num_grads', 1), + grad_acc_mode=rosa_config.get('grad_acc_mode', 'mean_squared'), + grad_4bit_accum=rosa_config.get('grad_4bit_accum', False), + mask_load_path=rosa_config.get('mask_load_path', None), + mask_save_path=rosa_config.get('mask_save_path', None), + terminate_after_mask_generation=rosa_config.get('terminate_after_mask_generation', False), + schedule=rosa_config.get('schedule', 'df'), + bias="none", + task_type="CAUSAL_LM", + ) + print('Adding RoSA modules...') + model = get_peft_model(model, config) + print('RoSA modules added!') + + train_metrics = [LanguageCrossEntropy(), LanguagePerplexity()] + eval_metrics = [ + LanguageCrossEntropy(), + LanguagePerplexity(), + InContextLearningLMAccuracy(), + InContextLearningMultipleChoiceAccuracy(), + InContextLearningQAAccuracy(), + InContextLearningCodeEvalAccuracy(), + InContextLearningLMExpectedCalibrationError(), + InContextLearningMCExpectedCalibrationError() + ] + + model = HuggingFaceModelWithFSDP( + model=model, + shift_labels=True, + 
tokenizer=tokenizer, + metrics=train_metrics, + eval_metrics=eval_metrics, + init_device='cpu', + peft_config=None + ) + + # model = ComposerHFCausalLM(model, tokenizer) + # model = ModelComposerHFCausalLM(model, tokenizer) + return model + +@hydra.main(version_base="1.1", config_path="../configs", config_name="panza_finetuning") +def main(cfg: DictConfig) -> Trainer: + override_config(cfg) + + preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) + + create_checkpoint_dirs(cfg) + environment = os.environ + #environment["PYTHONPATH"] = os.path.join(cfg.panza_workspace, "src") + environment["WANDB_PROJECT"] = f"panza-{cfg.user.username}" + environment["WANDB_DISABLED"] = str(int(cfg.wandb_disabled)) + environment["PANZA_PREPROCESSING_CONFIG"] = preprocessing_yaml + environment["TOKENIZERS_PARALLELISM"] = "False" + + cfg = cfg.finetuning + + print("config is") + print(cfg) + OmegaConf.set_struct(cfg, False) + + + # Run user provided code if specified + code_paths = pop_config(cfg, + 'code_paths', + must_exist=False, + default_value=[], + convert=True) + # Import any user provided code + for code_path in code_paths: + import_file(code_path) + + # Filter deprecation warning from torch internal usage + warnings.filterwarnings( + action='ignore', + category=UserWarning, + message= + 'torch.distributed.*_base is a private function and will be deprecated.*' + ) + + # Check for incompatibilities between the model and data loaders + validate_config(cfg) + + # Resolve all interpolation variables as early as possible + om.resolve(cfg) + + # Create copy of config for logging + logged_cfg: DictConfig = copy.deepcopy(cfg) + + cuda_alloc_conf = [] + # Get max split size mb + max_split_size_mb: Optional[int] = cfg.pop('max_split_size_mb', None) + if max_split_size_mb is not None: + cuda_alloc_conf.append(f'max_split_size_mb:{max_split_size_mb}') + + # Expandable segments + if cfg.pop('expandable_segments', False): + cuda_alloc_conf.append('expandable_segments:True') + + if len(cuda_alloc_conf) > 0: + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(cuda_alloc_conf) + + # Set CUDA lazy loading + # This can save a bit of memory if not all modules are needed + cuda_load_lazy: bool = cfg.pop('cuda_load_lazy', False) + if cuda_load_lazy: + os.environ['CUDA_MODULE_LOADING'] = 'LAZY' + + # Set seed first + seed: int = pop_config(cfg, 'seed', must_exist=True) + reproducibility.seed_all(seed) + + # Initialize pytorch distributed training process groups + dist_timeout: Union[int, float] = pop_config(cfg, + 'dist_timeout', + must_exist=False, + default_value=600.0) + dist.initialize_dist(get_device(None), timeout=dist_timeout) + + # Get global and device batch size information from distributed/single node setting + cfg = update_batch_size_info(cfg) + logged_cfg.update(cfg, merge=True) + + # Mandatory model training configs + model_config: DictConfig = pop_config(cfg, 'model', must_exist=True) + tokenizer_config: Dict[str, Any] = pop_config(cfg, + 'tokenizer', + must_exist=True, + convert=True) + optimizer_config: Dict[str, Any] = pop_config(cfg, + 'optimizer', + must_exist=True, + convert=True) + scheduler_config: Dict[str, Any] = pop_config(cfg, + 'scheduler', + must_exist=True, + convert=True) + train_loader_config: DictConfig = pop_config(cfg, + 'train_loader', + must_exist=True) + + # Optional fsdp data, fine-tuning, and eval configs + fsdp_config: Optional[Dict[str, Any]] = pop_config(cfg, + 'fsdp_config', + must_exist=False, + default_value=None, + convert=True) + + ds_config: Optional[Dict[str, Any]] = 
pop_config(cfg, + 'ds_config', + must_exist=False, + default_value=None, + convert=True) + + rosa_config: Optional[Dict[str, Any]] = pop_config(cfg, + 'rosa', + must_exist=False, + default_value=None, + convert=True) + + hf_save_path: Union[int, str] = pop_config(cfg, + 'hf_save_path', + must_exist=True) + + eval_loader_config: Optional[Union[DictConfig, ListConfig]] = pop_config( + cfg, 'eval_loader', must_exist=False, default_value=None) + icl_tasks_config: Optional[Union[ListConfig, + str]] = pop_config(cfg, + 'icl_tasks', + must_exist=False, + default_value=None) + eval_gauntlet_config: Optional[Union[DictConfig, + str]] = pop_config(cfg, + 'eval_gauntlet', + must_exist=False, + default_value=None) + icl_subset_num_batches: Optional[int] = pop_config(cfg, + 'icl_subset_num_batches', + must_exist=False, + default_value=None) + icl_seq_len: Optional[int] = pop_config(cfg, + 'icl_seq_len', + must_exist=False, + default_value=None) + # Optional logging, evaluation and callback configs + logger_configs: Optional[DictConfig] = pop_config(cfg, + 'loggers', + must_exist=False, + default_value=None, + convert=True) + callback_configs: Optional[DictConfig] = pop_config(cfg, + 'callbacks', + must_exist=False, + default_value=None, + convert=True) + algorithm_configs: Optional[DictConfig] = pop_config(cfg, + 'algorithms', + must_exist=False, + default_value=None) + + # Mandatory hyperparameters for training + device_train_batch_size: int = pop_config(cfg, + 'device_train_batch_size', + must_exist=True) + device_eval_batch_size: int = pop_config(cfg, + 'device_eval_batch_size', + must_exist=True) + max_duration: Union[int, str] = pop_config(cfg, + 'max_duration', + must_exist=True) + eval_interval: Union[int, str] = pop_config(cfg, + 'eval_interval', + default_value=1, + must_exist=False) + precision: str = pop_config(cfg, 'precision', must_exist=True) + max_seq_len: int = pop_config(cfg, 'max_seq_len', must_exist=True) + + # Optional parameters will be set to default values if not specified. 
+ default_run_name: str = os.environ.get('RUN_NAME', 'llm') + run_name: str = pop_config(cfg, + 'run_name', + must_exist=False, + default_value=default_run_name) + save_folder: Optional[str] = pop_config(cfg, + 'save_folder', + must_exist=False, + default_value=None) + is_state_dict_sharded: bool = (fsdp_config.get('state_dict_type', 'full') + == 'sharded') if fsdp_config else False + save_latest_filename: str = pop_config( + cfg, + 'save_latest_filename', + must_exist=False, + default_value='latest-sharded-rank{rank}' + if is_state_dict_sharded else 'latest-rank{rank}.pt') + save_overwrite: bool = pop_config(cfg, + 'save_overwrite', + must_exist=False, + default_value=False) + save_weights_only: bool = pop_config(cfg, + 'save_weights_only', + must_exist=False, + default_value=False) + save_filename: str = pop_config( + cfg, + 'save_filename', + must_exist=False, + default_value='ep{epoch}-ba{batch}-rank{rank}.pt') + save_interval: Union[str, int] = pop_config(cfg, + 'save_interval', + must_exist=False, + default_value='1000ba') + save_num_checkpoints_to_keep: int = pop_config( + cfg, 'save_num_checkpoints_to_keep', must_exist=False, default_value=-1) + progress_bar = pop_config(cfg, + 'progress_bar', + must_exist=False, + default_value=False) + log_to_console: bool = pop_config(cfg, + 'log_to_console', + must_exist=False, + default_value=True) + python_log_level: Optional[str] = pop_config(cfg, + 'python_log_level', + must_exist=False, + default_value='debug') + console_log_interval: Union[int, str] = pop_config(cfg, + 'console_log_interval', + must_exist=False, + default_value='1ba') + device_train_microbatch_size: Union[str, int] = pop_config( + cfg, + 'device_train_microbatch_size', + must_exist=False, + default_value='auto') + eval_subset_num_batches: int = pop_config(cfg, + 'eval_subset_num_batches', + must_exist=False, + default_value=-1) + eval_first: bool = pop_config(cfg, + 'eval_first', + must_exist=False, + default_value=False) + load_path: str = pop_config(cfg, + 'load_path', + must_exist=False, + default_value=None) + load_weights_only: bool = pop_config(cfg, + 'load_weights_only', + must_exist=False, + default_value=False) + load_strict_model_weights: bool = pop_config(cfg, + 'load_strict_model_weights', + must_exist=False, + default_value=True) + load_ignore_keys: Optional[List[str]] = pop_config(cfg, + 'load_ignore_keys', + must_exist=False, + default_value=None) + save_ignore_keys: Optional[List[str]] = pop_config(cfg, + 'save_ignore_keys', + must_exist=False, + default_value=None) + compile_config: Optional[Dict[str, Any]] = pop_config(cfg, + 'compile_config', + must_exist=False, + default_value=None) + metadata: Optional[Dict[str, str]] = pop_config(cfg, + 'metadata', + must_exist=False, + default_value=None, + convert=True) + should_log_config: bool = pop_config(cfg, + 'log_config', + must_exist=False, + default_value=True) + + num_cpu_threads: Optional[int] = cfg.pop('num_cpu_threads', 0) + if num_cpu_threads > 0: + print(f'Setting number of CPU threads to {num_cpu_threads}') + import spops + torch.set_num_threads(num_cpu_threads) + spops.set_num_threads(num_cpu_threads) + + # Enable autoresume from model checkpoints if possible + autoresume_default: bool = False + if logged_cfg.get('run_name', None) is not None \ + and save_folder is not None \ + and not save_overwrite \ + and not save_weights_only: + autoresume_default = True + + if cfg.get('autoresume') is None and autoresume_default: + log.info('As run_name, save_folder, and save_latest_filename are set, \ + 
changing autoresume default to True...') + + autoresume: bool = pop_config(cfg, + 'autoresume', + must_exist=False, + default_value=autoresume_default) + + # Pop known unused parameters that are used as interpolation variables or + # created by update_batch_size_info. + pop_config(cfg, 'data_local', must_exist=False) + pop_config(cfg, 'data_remote', must_exist=False) + pop_config(cfg, 'global_seed', must_exist=False) + pop_config(cfg, 'global_train_batch_size', must_exist=False) + pop_config(cfg, 'n_gpus', must_exist=False) + pop_config(cfg, 'device_train_grad_accum', must_exist=False) + + assert fsdp_config is None or ds_config is None, 'fsdp and deepspeed are not supported together' + + # Warn users for unused parameters + for key in cfg: + warnings.warn( + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary.' + ) + + # Warn if fsdp is enabled but user only has 1 GPU + if dist.get_world_size() == 1 and fsdp_config is not None: + warnings.warn( + 'FSDP is not applicable for single-GPU training. Reverting to DDP.') + fsdp_config = None + + # set logging level + if python_log_level is not None: + logging.basicConfig( + # Example of format string + # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here + format= + f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' + ) + logging.getLogger('llmfoundry').setLevel( + python_log_level.upper()) # Foundry module + logging.getLogger(__name__).setLevel( + python_log_level.upper()) # Train script + + # Initialize context + init_context = process_init_device(model_config, fsdp_config) + logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) + + # Build tokenizer + log.info('Building tokenizer...') + tokenizer_name = tokenizer_config['name'] + tokenizer_kwargs = tokenizer_config.get('kwargs', {}) + tokenizer_kwargs["num_proc"] = 1 + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + + # Scheduler + scheduler_name: str = scheduler_config.pop('name') + scheduler = build_scheduler(scheduler_name, scheduler_config) + + # Loggers + loggers = [ + build_logger(str(name), logger_cfg) + for name, logger_cfg in logger_configs.items() + ] if logger_configs else [] + + mosaicml_logger = find_mosaicml_logger(loggers) + if mosaicml_logger is None: + mosaicml_logger = maybe_create_mosaicml_logger() + if mosaicml_logger is not None: + # mosaicml_logger will be None if run isn't on MosaicML platform + loggers.append(mosaicml_logger) + + if metadata is not None: + # Flatten the metadata for logging + logged_cfg.pop('metadata', None) + logged_cfg.update(metadata, merge=True) + if mosaicml_logger is not None: + mosaicml_logger.log_metrics(metadata) + mosaicml_logger._flush_metadata(force_flush=True) + + # Profiling + profiler: Optional[Profiler] = None + profiler_cfg: Optional[DictConfig] = pop_config(cfg, + 'profiler', + must_exist=False, + convert=False, + default_value=None) + if profiler_cfg: + profiler_schedule_cfg: Dict = pop_config(profiler_cfg, + 'schedule', + must_exist=True, + convert=True) + profiler_schedule = cyclic_schedule(**profiler_schedule_cfg) + # Only support json trace handler + profiler_trace_handlers: List[TraceHandler] = [] + profiler_trace_cfg: Optional[Dict] = pop_config(profiler_cfg, + 'json_trace_handler', + must_exist=False, + default_value=None, + convert=True) + if profiler_trace_cfg: + profiler_trace_handlers.append( + JSONTraceHandler(**profiler_trace_cfg)) + profiler = Profiler(**profiler_cfg, + 
trace_handlers=profiler_trace_handlers, + schedule=profiler_schedule) + + # Callbacks + callbacks: List[Callback] = [ + build_callback(str(name), callback_cfg, om.to_container(logged_cfg)) + for name, callback_cfg in callback_configs.items() + ] if callback_configs else [] + + use_async_eval = any(isinstance(c, AsyncEval) for c in callbacks) + + print('ROSA CONFIG', rosa_config) + # Build Model + print('Initializing model...') + with init_context: + assert fsdp_config is None or rosa_config is None, 'fsdp is cuurently not supported with RoSA' + model = build_composer_peft_model(model_config, rosa_config, tokenizer, is_fsdp=fsdp_config is not None) + if rosa_config is not None: + assert isinstance(model.model.base_model, RosaModel) + + # Algorithms + algorithms = [ + build_algorithm(str(name), algorithm_cfg) + for name, algorithm_cfg in algorithm_configs.items() + ] if algorithm_configs else [] + + if rosa_config is not None: + algorithms.append(RosaScheduler(model.model.base_model)) + + # Dataloaders + log.info('Building train loader...') + from datasets import disable_caching + from streaming.base.util import clean_stale_shared_memory + clean_stale_shared_memory() + try: + disable_caching() + + train_loader = build_dataloader( + train_loader_config, + tokenizer, + device_train_batch_size, + ) + except Exception as e: + if mosaicml_logger is not None: + mosaicml_logger.log_exception(e) + raise e + + if mosaicml_logger is not None: + mosaicml_logger.log_metrics({'data_validated': time.time()}) + + ## Evaluation + if use_async_eval: + evaluators = [] + if eval_first: + warnings.warn( + 'AsyncEval callback does not support eval_first=True. Ignoring.' + ) + eval_first = False + + else: + log.info('Building eval loader...') + eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len + evaluators, _, eval_gauntlet_callback = build_evaluators( + eval_loader_config, + icl_tasks_config, + eval_gauntlet_config, + tokenizer=tokenizer, + device_eval_batch_size=device_eval_batch_size, + icl_seq_len=eval_icl_seq_len, + icl_subset_num_batches=icl_subset_num_batches, + ) + if eval_gauntlet_callback is not None: + callbacks.append(eval_gauntlet_callback) + + if mosaicml_logger is not None: + log_train_analytics(mosaicml_logger, model_config, train_loader_config, + eval_loader_config, callback_configs, + tokenizer_name, load_path, icl_tasks_config, + eval_gauntlet_config) + # # Build Model + # log.info('Initializing model...') + # model = build_composer_model( + # name=model_config.name, + # cfg=model_config, + # tokenizer=tokenizer, + # init_context=init_context, + # master_weights_dtype=model_config.get('master_weights_dtype', None), + # ) + + # Log number of parameters + if hasattr(model, 'n_total_params'): + n_params = model.n_total_params + n_trainable_params = n_params # TODO: we currently assume all parameters are trainable. 
+ else: + n_params = sum(p.numel() for p in model.parameters()) + n_trainable_params = sum( + p.numel() for p in model.parameters() if p.requires_grad) + if hasattr(model, 'n_active_params'): + n_active_params = model.n_active_params + else: + n_active_params = n_params + logged_cfg.update({ + 'n_params': n_params, + 'n_active_params': n_active_params, + 'n_trainable_params': n_trainable_params, + }) + + # Optimizer + optimizer_name: str = optimizer_config.pop('name') + if rosa_config is None or 'lora_lr' not in rosa_config: + optimizer = build_optimizer(model, optimizer_name, optimizer_config) + else: + print(f'Using a different learning rate for lora params {rosa_config["lora_lr"]}') + assert optimizer_name == 'decoupled_adamw' + lora_params = [] + other_params = [] + for name, param in model.named_parameters(): + if any([k in name for k in ['rosa_A', 'rosa_B', 'rosa_embedding_A', 'rosa_embedding_B']]): + lora_params.append(param) + else: + other_params.append(param) + + print(f'Found {len(lora_params)} lora params and {len(other_params)} other params') + params = [ + {'params': other_params}, + {'params': lora_params, 'lr': rosa_config['lora_lr']} + ] + optimizer = DecoupledAdamW(params, **optimizer_config) + + + + # Now add the eval metrics + try: + if eval_loader_config is not None and not use_async_eval: + eval_metrics = model.get_metrics(is_train=False) + non_icl_metrics = [ + metric_name for metric_name, metric in eval_metrics.items() + if not isinstance(metric, InContextLearningMetric) + ] + evaluators = add_metrics_to_eval_loaders(evaluators, + non_icl_metrics) + except Exception as e: + if mosaicml_logger is not None: + mosaicml_logger.log_exception(e) + raise e + + # Build the Trainer + log.info('Building trainer...') + trainer = Trainer( + run_name=run_name, + seed=seed, + model=model, + train_dataloader=train_loader, + eval_dataloader=evaluators, + optimizers=optimizer, + schedulers=scheduler, + max_duration=max_duration, + eval_interval=eval_interval, + eval_subset_num_batches=eval_subset_num_batches, + progress_bar=progress_bar, + log_to_console=log_to_console, + console_log_interval=console_log_interval, + loggers=loggers, + callbacks=callbacks, + precision=precision, + algorithms=algorithms, + device_train_microbatch_size=device_train_microbatch_size, + fsdp_config=fsdp_config, + deepspeed_config=ds_config, + save_folder=save_folder, + save_filename=save_filename, + save_latest_filename=save_latest_filename, + save_interval=save_interval, + save_num_checkpoints_to_keep=save_num_checkpoints_to_keep, + save_overwrite=save_overwrite, + save_weights_only=save_weights_only, + load_path=load_path, + load_weights_only=load_weights_only, + load_strict_model_weights=load_strict_model_weights, + load_ignore_keys=load_ignore_keys, + save_ignore_keys=save_ignore_keys, + autoresume=autoresume, + python_log_level=python_log_level, + dist_timeout=dist_timeout, + profiler=profiler, + compile_config=compile_config, + ) + + if should_log_config: + log.info('Logging config') + log_config(logged_cfg) + torch.cuda.empty_cache() + gc.collect() + + # Eval first if requested + if eval_first and trainer.state.timestamp.batch.value == 0: + trainer.eval() + + log.info('Starting training...') + trainer.fit() + + # if rosa is enabled, save the model manually, since + # llm-foundry's checkpointing doesn't work properly with RoSA + if rosa_config is not None: + assert fsdp_config is None, 'fsdp is cuurently not supported with RoSA' + path_to_save = os.path.join(hf_save_path, run_name) + print(f'saving 
the model to {path_to_save}') + if torch.distributed.get_rank() == 0: + model.model.save_pretrained(path_to_save, is_main_process=True, state_dict=model.model.state_dict()) + tokenizer.save_pretrained(path_to_save) + + # print('Saving directly into HF-friendly format') + + # path_to_save = os.path.join(hf_save_path, run_name) + # print('saving the model.') + # if fsdp_config is None: + # model.model.save_pretrained(path_to_save, is_main_process=torch.distributed.get_rank() == 0, state_dict=model.model.state_dict()) + # else: + # with FSDP.summon_full_params(model.model, writeback=False, rank0_only=True, offload_to_cpu=True): + # model_to_save = model.model + # model_to_save.save_pretrained(path_to_save, state_dict=model_to_save.state_dict()) + + # if torch.distributed.get_rank() == 0: + # tokenizer.save_pretrained(path_to_save) + + # # NOTE: for some reason the saving code above would create empty pytorch_model.bin file, so we delete it manually + # # TODO: figure out why this happens + # if torch.distributed.get_rank() == 0 and os.path.exists(os.path.join(path_to_save, "pytorch_model.bin")): + # tmp = torch.load(os.path.join(path_to_save, "pytorch_model.bin")) + # if not tmp: # empty dict, remove it + # os.remove(os.path.join(path_to_save, "pytorch_model.bin")) + + log.info('Done.') + return trainer + + +PY = None +FY = None + +def do_thing(cfg:DictConfig) -> List[str]: + # Override configuration + override_config(cfg) + + create_checkpoint_dirs(cfg) + + # Launch training + print("HEEEeEre") + preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) + finetuning_yaml = save_config_to_yaml(cfg.finetuning) + print(preprocessing_yaml, finetuning_yaml) + PY = preprocessing_yaml + FY = finetuning_yaml + #return "hellooooo", "lol" + #return [preprocessing_yaml, finetuning_yaml] + + +if __name__ == '__main__': + # yaml_path, args_list = sys.argv[1], sys.argv[2:] + + # # Disable resolving environment variables through omegaconf. + # om.clear_resolver('oc.env') + + # #if get_local_rank == 0: + # log.info("Starting Panza Finetuning") + # print("hello") + # with open(yaml_path) as f: + # print("loading") + # cfg = om.load(f) + #raise ValueError(cfg) + #if os.getenv("LOCAL_RANK", '0') == "0": + # do_thing() + # raise ValueError(PY) + # print(preprocessing_yaml, finetuning_yaml) + # environment = os.environ.copy() + # environment["PYTHONPATH"] = os.path.join(cfg.panza_workspace, "src") + # environment["WANDB_PROJECT"] = f"panza-{cfg.user.username}" + # environment["WANDB_DISABLED"] = str(int(cfg.wandb_disabled)) + # environment["PANZA_PREPROCESSING_CONFIG"] = preprocessing_yaml + # sys.exit() + # #log.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) + # # Load yaml and cli arguments. 
+ # with open(yaml_path) as f: + # yaml_cfg = om.load(f) + # cli_cfg = om.from_cli(args_list) + # cfg = om.merge(yaml_cfg, cli_cfg) + # om.resolve(cfg) + # assert isinstance(cfg, DictConfig) + main() diff --git a/scripts/output.tx b/scripts/output.tx new file mode 100644 index 0000000..e69de29 diff --git a/src/panza/evaluation/.evaluate_summaries.py.swp b/src/panza/evaluation/.evaluate_summaries.py.swp new file mode 100644 index 0000000000000000000000000000000000000000..6787e235bdb151d90c2b5baaf71c4a24745a56ae GIT binary patch literal 16384 zcmeHOO^h5z6>fq_$R9)@_<#f|_eM;Qm}&2iiNnG~TE`n>ylXFx?T8GkPS14BY_mPx zov!L#j~5e(V-PBaa5WbM->a(bo|*kgKpYVC zNT0g9Ue&8tuU@^b_o}Bif9?D_`-D4h;rEbb{p^v?3|3xgAOH3vr!5{PgD_5Zh;-C1 z@Wtc9G+#Wb4$;0;Fg&^8$2a_SKa5U_tba14r;~igk8)pzNnG4=PvoPKpM_kw>GXG zf&V=X$n-Aj1q|yhGqmT-^<6X9cbn&yx%xA7^H^VM4b&Q_HBf7y)XGf&W4c_o9)=ZuWMd%tB#;9=mW@3X9DfKLH`c+j%;fV04p zz!_i>_$crp;342{;BOCD)?a|PfLp*zzz&cDJ>V+vY2X}i=YGq26L$|{r01tQqm;?TCw`Khmcme1GJ>W^;QQ#in=kKwsuK=G1y1;3G0k@7>)+@lbfG-0V z00tZfJ_y`G66Kq~3UC_uJCZK1178OcpaW1MhUVgSk$^Gt@eVHRfM3jaJwC#R7Ocbi zNji1|&QrQ^RB6+iZ4(ZdqI7L_u{T+#!`Lv10v>BlC1rW4@gT{js|qr?w{6NNE%h0Ma#X|x*6CZq1Dkh-O-*qoA1 z6@kaV&A1<^C#M?xZYZ}{lJeN`v!QT}yf=8DM=ZQS81WXfGrP%r!3J7R9tl1z@xF%W zamZty`4ZoTC8lERbr766q(rQ87NiR4cRga*~gfY zSk>}Gc!O8NCXIxoj?9mT+?j8&2m_qu(`?OhXfDHQ&2L&3QayzCPW3|BQT|D6RW0v|b z^A6in!=GS#)1I1yS#nAAh2WWF3BJYxCs*+njv^OIRVrek?)+jBLr6tWvoMzXjZe{L zO%ft@_eTD>$Gwcl{;2p2Z0u||mv_`k-)^y;%E7pyRGyjCJ*T%HaUxA@i9_;zY`DTdbE$HdNMH8Y*nflZkP~6SivWhI)dGZ7SQXx_5aT z2olb$Mj}H2sh`1;Va2$3RAyG6DffebO7jti%|hdPskCXW+AX%lqqJkM=+%S?q++HN zpzATouS+ulhcG0!*$(42IZ|+h1^Ia-J5TxWK=8-v&Csz0d=dBXrr<|&Q_jCl&L6y# z!^2uDUH7rmf=;;gnx2IN?QA$!GTl3y&QwOTc%+eksRv>wX^i%kxLK z*HhPq>{K%0aE3fmfG$`G6&{o?VQA}uz`bIOr7*>Rnnq#Q7*4)9xuw&4?6v z#IcF%3zrs++lzQ!4B*V#>eZ+FSOI$Y`zvQjKE!rU_j~!$)pM&8)y>DmC7o7WDI^A8 zNJU%7cvw84Gs7<_Ymjp)N@?R<+!xk>jrkmz3B7R0!AIEZ#gd8C$C^v9*NAB@;Mm5+ zI%nrLuCy0j(@cU9mjd+ymgF+crGN{-gL#Sev31HN2Qh}P#9pV(ulu7EKBwo4P`K<0 zy;Z_d7~w-{Ii_Ys6ZAmD^_8pFR+%7OvJv+Mwn~Bp*cW6Wye&4Nlg%A748dU-B>#oA zb^w12A>l-ttjFap$6f+MH3NV#!r&mF&{oN#aWS^IJ4ZtHS*Mg8HJCD(qmKeZc4#-kn!hps2`kj0g-4AEV2I; zm=zi*>9tt`d|`b9b&Io)p6N6Ja|ac zqFvY|5p1FQG!Y_%g_CkjC!1sW!0WIJG z;4Q@ZzX5&({1EVg4WI@57P0*ofj!_jKr#NSz>C1=fHrVHa363RG5j}xi@-W?8aM{r zMlAnTAO|Gi0&harH-J9^uK^_ct3ds$HBf7y)Xc61PDbO9e8-y5XabeV&|=Myx8_)f9(09ABIIhjyyH?l$}#L z2CZ@(rV)~`CM6W!(D_c2AaW%V0WH?pZK$-dsXmnvo0Ou;*{D&ez(~v2Rl7B$;t@Q# zB(c-Yx(!v>*i@Tbq1PaO5b>=;hXO8A1e zpW;LyaLOZ;O0C4cEUqSp{=mvrGIAoDTPhYQOQI|NQt0X_br-qIDw$g`wrT2gH9Ci3 z)#Gd@WgFBw@|Z`>y=IB3>d7esnwE)Os)1HLqIPL}RO)ogW}7Y6)uG3xK8GPX4YNx( z3kZF2S=5&$eL1xWBeh74^eZ``6PqEemtaAgQ>Y@>y6F~C5O>l%?t9V)+a`{Kc9Cmu zx+0B2m}e!r3B!=>oMovgCM`U+-zjIEJ*MeYqR{}0P_jHI(z9p`2&3uhMwP%QL)`6- z{B106cphgD>uH%$x=AOtu||opN=vxZ4uLX-t~ZUoNSCL;rA&`Fu}496;hl)SbP#<> zt47VmLG;CnzHktILDMOAMW7vpy$t#3BGn*N>SH3Jr3e*cijmluNDEM08YJpq0^vDg zX=Ja5IFrCS^COA00a=62>vUL^(lM)P4rHm*|P+48phhqa+z}rwUWxz! literal 0 HcmV?d00001 diff --git a/src/panza/evaluation/evaluate b/src/panza/evaluation/evaluate new file mode 100644 index 0000000..a56617a --- /dev/null +++ b/src/panza/evaluation/evaluate @@ -0,0 +1,180 @@ +# We conduct evaluations with three scores. +# The BLEU score is frequently used to evaluate translations and compares n-grams in a 'golden' +# translation to those in a candidate translation. Multiple golden translations are possible. +# The ROUGE score is frequently used for translation and summarization; it also looks at +# n-gram similarity. It is actually several scores, since precision, recall, and F1 score are +# reported separately. 
+# The MAUVE score measures distribution similarity (in the sense of KL-divergence) between the +# targets and outputs, and is not computed on a per-example basis. The similarity is computed +# in the latent space of an LLM, by default GPT-2. + + +import json +import os +import re +import string +import sys + +from evaluate import load +from torchmetrics.text.rouge import ROUGEScore +from torchmetrics.text.bleu import BLEUScore + +import numpy as np +import torch +import wandb + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + +from panza.evaluation import base_inference +from panza.utils import prompting, rag + +sys.path.pop(0) + + +def main(): + parser = base_inference.get_base_inference_args_parser() + parser.add_argument("--responses-per-prompt", type=int, default=1) + parser.add_argument("--golden", type=str, default=None) + parser.add_argument("--batch-size", type=int, default=1) + parser.add_argument("--wandb-run-id", type=str, default=None) + args = parser.parse_args() + + rouge = ROUGEScore() + # This library computes the BLEU score components separately. We do not use a length penalty. + bleu1 = BLEUScore(n_gram=1) + bleu2 = BLEUScore(n_gram=2) + bleu3 = BLEUScore(n_gram=3) + bleu4 = BLEUScore(n_gram=4) + mauve = load('mauve') + + if args.nthreads is not None: + torch.set_num_threads(args.nthreads) + + print("Loading model ", args.model) + model, tokenizer = base_inference.load_model_and_tokenizer(args.model, args.device, args.dtype, load_in_4bit=args.load_in_4bit) + + if args.use_rag: + embeddings_model = rag.get_embeddings_model(args.embedding_model) + db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) + + system_preamble, user_preamble, rag_preamble = prompting.load_all_preambles( + args.system_preamble, args.user_preamble, args.rag_preamble + ) + + with open(args.golden, "r") as f: + golden_lines = [json.loads(l) for l in f.readlines()] + + grouped_golden = {} + for entry in golden_lines: + if entry["summary"] in grouped_golden: + grouped_golden[entry["summary"]]["templates"].append(entry["email"]) + else: + grouped_golden[entry["summary"]] = {} + grouped_golden[entry["summary"]]["templates"] = [(entry["email"])] + + print("Evaluating with batch size", args.batch_size) + + results = {} + all_results = [] + prompt_scores = {} + outputs_logs = {} + grouped_golden = list(grouped_golden.items()) + for i in range(0, len(grouped_golden), args.batch_size): + batch = grouped_golden[i:i + args.batch_size] + prompts = [item[0] for item in batch] + golden_responses = [item[1]["templates"] for item in batch] + + #prompt_scores = [[] for _ in range(len(prompts))] + for _ in range(args.responses_per_prompt): + full_prompts, outputs = base_inference.run_inference( + instructions=prompts, + model=model, + tokenizer=tokenizer, + system_preamble=system_preamble, + user_preamble=user_preamble, + rag_preamble=rag_preamble, + rag_relevance_threshold=args.rag_relevance_threshold, + rag_num_emails=args.rag_num_emails, + use_rag=args.use_rag, + db=db if args.use_rag else None, + max_new_tokens=args.max_new_tokens, + best=args.best, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + device=args.device, + ) + + # Remove some boilerplate added by instruction-tuned models w/out finetuning. 
+ outputs = [o.replace("Here is the email:\n", "") for o in outputs] + outputs = [re.sub(r'SUBJECT:.*\n', "", o) for o in outputs] + outputs = [re.sub(r'Subject:.*\n', "", o) for o in outputs] + outputs = [re.sub(r'E-MAIL CONTENT:.*\n', "", o) for o in outputs] + for j, prompt in enumerate(prompts): + # We clean up the strings for the BLEU and ROUGE scores. + punc_table = str.maketrans({key: None for key in string.punctuation}) + golden = [" ".join(x.translate(punc_table).lower().split()) for x in golden_responses[j]] + candidate = " ".join(outputs[j].translate(punc_table).lower().split()) + + rouge_score = rouge(outputs[j], golden_responses[j]) + bleu_score = np.mean([bleu([candidate], [golden]) for bleu in [bleu1, bleu2, bleu3, bleu4]]) + rouge_score = rouge(candidate, golden) + if prompt not in prompt_scores.keys(): + prompt_scores[prompt] = {"prompt": prompt, "full_prompt": full_prompts[j], + "golden" : golden_responses[j], "output": [outputs[j]], + "BLEU": [bleu_score.item()]} + for score, value in rouge_score.items(): + prompt_scores[prompt][score] = [value.item()] + else: + prompt_scores[prompt]["output"].append(outputs[j]) + prompt_scores[prompt]["BLEU"].append(bleu_score.item()) + for score, value in rouge_score.items(): + prompt_scores[prompt][score].append(value.item()) + + print("\n-----------\n", "PROMPT:\n", prompt, "\n\nOUTPUT:\n", outputs[j], "\n\nBLEU SCORE:\n", bleu_score, "\n\nROUGE SCORE:\n", rouge_score) + + + means = {} + mins = {} + score_names = [k for k in prompt_scores.values().__iter__().__next__().keys() if 'BLEU' in k or 'rouge' in k] + + for k in score_names: + means[k] = np.mean([v for scores in prompt_scores.values() for v in scores[k] ]) + mins[k] = np.min([v for scores in prompt_scores.values() for v in scores[k] ]) + + # To compute the MAUVE score, we need equal-length flat arrays of + # outputs and goldens. If we have multiple outputs per prompt, we + # output them all, with the same golden prompt. + # TODO: not sure if it would be better to randomly sample from the + # outputs in this case. + # TODO: consider handling the case where there are also multiple golden + # queries per output. (We don't use this for anything now). 
+ flattened_golden = [] + flattened_outputs = [] + for prompt_info in prompt_scores.values(): + flattened_golden += ([prompt_info["golden"][0]])*len(prompt_info['output']) + flattened_outputs += prompt_info['output'] + mauve_score = mauve.compute(predictions=flattened_outputs, references=flattened_golden) + print("MAUVE score", mauve_score) + means["MAUVE"] = mauve_score.mauve + print("Mean scores across all prompts: ", {f" {k}: {v}" for k, v in means.items()}) + + + # Optionally, update wandb run with eval scores + rag_str = "RAG-" if args.use_rag else "" + if args.wandb_run_id: + with wandb.init(id=args.wandb_run_id, resume=True): + wandb.log({f"EVAL/{k}-{rag_str}mean": v for k, v in means.items()}) + wandb.log({f"EVAL/{k}-{rag_str}min": v for k, v in mins.items()}) + else: + print({f"EVAL/{k}-{rag_str}mean": v for k, v in means.items()}) + print({f"EVAL/{k}-{rag_str}min": v for k, v in mins.items()}) + + with open(os.path.join(args.model, f"{rag_str}eval_responses.txt"), 'w') as f: + json.dump(prompt_scores, f, ensure_ascii=False, indent=4) + + with open(os.path.join(args.model, f"{rag_str}eval_summary.txt"), 'w') as f: + json.dump({"means": means, "mins": mins}, f, ensure_ascii=False, indent=4) + +if __name__ == "__main__": + main() diff --git a/src/panza/evaluation/evaluate_backup.py b/src/panza/evaluation/evaluate_backup.py new file mode 100644 index 0000000..a56617a --- /dev/null +++ b/src/panza/evaluation/evaluate_backup.py @@ -0,0 +1,180 @@ +# We conduct evaluations with three scores. +# The BLEU score is frequently used to evaluate translations and compares n-grams in a 'golden' +# translation to those in a candidate translation. Multiple golden translations are possible. +# The ROUGE score is frequently used for translation and summarization; it also looks at +# n-gram similarity. It is actually several scores, since precision, recall, and F1 score are +# reported separately. +# The MAUVE score measures distribution similarity (in the sense of KL-divergence) between the +# targets and outputs, and is not computed on a per-example basis. The similarity is computed +# in the latent space of an LLM, by default GPT-2. + + +import json +import os +import re +import string +import sys + +from evaluate import load +from torchmetrics.text.rouge import ROUGEScore +from torchmetrics.text.bleu import BLEUScore + +import numpy as np +import torch +import wandb + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + +from panza.evaluation import base_inference +from panza.utils import prompting, rag + +sys.path.pop(0) + + +def main(): + parser = base_inference.get_base_inference_args_parser() + parser.add_argument("--responses-per-prompt", type=int, default=1) + parser.add_argument("--golden", type=str, default=None) + parser.add_argument("--batch-size", type=int, default=1) + parser.add_argument("--wandb-run-id", type=str, default=None) + args = parser.parse_args() + + rouge = ROUGEScore() + # This library computes the BLEU score components separately. We do not use a length penalty. 
+ bleu1 = BLEUScore(n_gram=1) + bleu2 = BLEUScore(n_gram=2) + bleu3 = BLEUScore(n_gram=3) + bleu4 = BLEUScore(n_gram=4) + mauve = load('mauve') + + if args.nthreads is not None: + torch.set_num_threads(args.nthreads) + + print("Loading model ", args.model) + model, tokenizer = base_inference.load_model_and_tokenizer(args.model, args.device, args.dtype, load_in_4bit=args.load_in_4bit) + + if args.use_rag: + embeddings_model = rag.get_embeddings_model(args.embedding_model) + db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) + + system_preamble, user_preamble, rag_preamble = prompting.load_all_preambles( + args.system_preamble, args.user_preamble, args.rag_preamble + ) + + with open(args.golden, "r") as f: + golden_lines = [json.loads(l) for l in f.readlines()] + + grouped_golden = {} + for entry in golden_lines: + if entry["summary"] in grouped_golden: + grouped_golden[entry["summary"]]["templates"].append(entry["email"]) + else: + grouped_golden[entry["summary"]] = {} + grouped_golden[entry["summary"]]["templates"] = [(entry["email"])] + + print("Evaluating with batch size", args.batch_size) + + results = {} + all_results = [] + prompt_scores = {} + outputs_logs = {} + grouped_golden = list(grouped_golden.items()) + for i in range(0, len(grouped_golden), args.batch_size): + batch = grouped_golden[i:i + args.batch_size] + prompts = [item[0] for item in batch] + golden_responses = [item[1]["templates"] for item in batch] + + #prompt_scores = [[] for _ in range(len(prompts))] + for _ in range(args.responses_per_prompt): + full_prompts, outputs = base_inference.run_inference( + instructions=prompts, + model=model, + tokenizer=tokenizer, + system_preamble=system_preamble, + user_preamble=user_preamble, + rag_preamble=rag_preamble, + rag_relevance_threshold=args.rag_relevance_threshold, + rag_num_emails=args.rag_num_emails, + use_rag=args.use_rag, + db=db if args.use_rag else None, + max_new_tokens=args.max_new_tokens, + best=args.best, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + device=args.device, + ) + + # Remove some boilerplate added by instruction-tuned models w/out finetuning. + outputs = [o.replace("Here is the email:\n", "") for o in outputs] + outputs = [re.sub(r'SUBJECT:.*\n', "", o) for o in outputs] + outputs = [re.sub(r'Subject:.*\n', "", o) for o in outputs] + outputs = [re.sub(r'E-MAIL CONTENT:.*\n', "", o) for o in outputs] + for j, prompt in enumerate(prompts): + # We clean up the strings for the BLEU and ROUGE scores. 
+ punc_table = str.maketrans({key: None for key in string.punctuation}) + golden = [" ".join(x.translate(punc_table).lower().split()) for x in golden_responses[j]] + candidate = " ".join(outputs[j].translate(punc_table).lower().split()) + + rouge_score = rouge(outputs[j], golden_responses[j]) + bleu_score = np.mean([bleu([candidate], [golden]) for bleu in [bleu1, bleu2, bleu3, bleu4]]) + rouge_score = rouge(candidate, golden) + if prompt not in prompt_scores.keys(): + prompt_scores[prompt] = {"prompt": prompt, "full_prompt": full_prompts[j], + "golden" : golden_responses[j], "output": [outputs[j]], + "BLEU": [bleu_score.item()]} + for score, value in rouge_score.items(): + prompt_scores[prompt][score] = [value.item()] + else: + prompt_scores[prompt]["output"].append(outputs[j]) + prompt_scores[prompt]["BLEU"].append(bleu_score.item()) + for score, value in rouge_score.items(): + prompt_scores[prompt][score].append(value.item()) + + print("\n-----------\n", "PROMPT:\n", prompt, "\n\nOUTPUT:\n", outputs[j], "\n\nBLEU SCORE:\n", bleu_score, "\n\nROUGE SCORE:\n", rouge_score) + + + means = {} + mins = {} + score_names = [k for k in prompt_scores.values().__iter__().__next__().keys() if 'BLEU' in k or 'rouge' in k] + + for k in score_names: + means[k] = np.mean([v for scores in prompt_scores.values() for v in scores[k] ]) + mins[k] = np.min([v for scores in prompt_scores.values() for v in scores[k] ]) + + # To compute the MAUVE score, we need equal-length flat arrays of + # outputs and goldens. If we have multiple outputs per prompt, we + # output them all, with the same golden prompt. + # TODO: not sure if it would be better to randomly sample from the + # outputs in this case. + # TODO: consider handling the case where there are also multiple golden + # queries per output. (We don't use this for anything now). 
+ flattened_golden = [] + flattened_outputs = [] + for prompt_info in prompt_scores.values(): + flattened_golden += ([prompt_info["golden"][0]])*len(prompt_info['output']) + flattened_outputs += prompt_info['output'] + mauve_score = mauve.compute(predictions=flattened_outputs, references=flattened_golden) + print("MAUVE score", mauve_score) + means["MAUVE"] = mauve_score.mauve + print("Mean scores across all prompts: ", {f" {k}: {v}" for k, v in means.items()}) + + + # Optionally, update wandb run with eval scores + rag_str = "RAG-" if args.use_rag else "" + if args.wandb_run_id: + with wandb.init(id=args.wandb_run_id, resume=True): + wandb.log({f"EVAL/{k}-{rag_str}mean": v for k, v in means.items()}) + wandb.log({f"EVAL/{k}-{rag_str}min": v for k, v in mins.items()}) + else: + print({f"EVAL/{k}-{rag_str}mean": v for k, v in means.items()}) + print({f"EVAL/{k}-{rag_str}min": v for k, v in mins.items()}) + + with open(os.path.join(args.model, f"{rag_str}eval_responses.txt"), 'w') as f: + json.dump(prompt_scores, f, ensure_ascii=False, indent=4) + + with open(os.path.join(args.model, f"{rag_str}eval_summary.txt"), 'w') as f: + json.dump({"means": means, "mins": mins}, f, ensure_ascii=False, indent=4) + +if __name__ == "__main__": + main() diff --git a/src/panza/utils/documents.py b/src/panza/utils/documents.py new file mode 100644 index 0000000..ccd7f85 --- /dev/null +++ b/src/panza/utils/documents.py @@ -0,0 +1,46 @@ +import copy +import json +from abc import ABC, abstractmethod +from dataclasses import asdict, dataclass +from datetime import datetime +from typing import Dict, List, Optional, Union + + +@dataclass +class Document(ABC): + summary: Optional[str] = None + + @abstractmethod + def serialize(self) -> dict: + """Convert the document to a dictionary that can be serialized to JSON.""" + pass + + @classmethod + @abstractmethod + def deserialize(cls, data: Union[str, Dict]) -> "Document": + """Convert a serialized document into a Document object.""" + pass + + +@dataclass(kw_only=True) +class Email(Document): + email: str + subject: str + thread: List[str] + date: datetime + + def serialize(self) -> dict: + dictionary = asdict(self) + dictionary["date"] = self.date.isoformat() + return dictionary + + @classmethod + def deserialize(cls, data: Union[str, Dict]) -> "Email": + if isinstance(data, str): + dictionary = json.loads(data) + elif isinstance(data, dict): + dictionary = copy.deepcopy(data) + else: + raise ValueError(f"Cannot deserialize data of type {type(data)}. 
Must be str or dict.") + dictionary["date"] = datetime.fromisoformat(dictionary["date"]) + return cls(**dictionary) diff --git a/src/panza3/data_preparation/prepare_raft_emails.py b/src/panza3/data_preparation/prepare_raft_emails.py new file mode 100644 index 0000000..429ac30 --- /dev/null +++ b/src/panza3/data_preparation/prepare_raft_emails.py @@ -0,0 +1,92 @@ +import argparse +import gc +import json +import os +import sys +import time +from typing import Dict, List, Text + +import torch +from tqdm import tqdm + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + +from panza.utils import rag +from panza.utils.documents import Email + +sys.path.pop(0) + + +def retrieve_similar_emails(batch, db, num_emails): + emails = [] + for email in batch: + try: + relevant_emails = db._similarity_search_with_relevance_scores( + email["email"], k=num_emails + ) + except Exception as e: + print(f"Error in RAG search: {e}") + relevant_emails = [] + return relevant_emails + + relevant_emails = [ + {"serialized_email": r[0].metadata["serialized_email"], "score": r[1]} + for r in relevant_emails + if r[0].page_content not in email["email"] + ] + email["relevant_emails"] = relevant_emails + emails.append(email) + + return emails + + +def main(): + parser = argparse.ArgumentParser( + description="Get similar emails for Retrieval Augmented Fine Tuning (RAFT)" + ) + parser.add_argument("--path-to-emails", help="Path to the cleaned emails") + parser.add_argument( + "--embedding-model", type=str, default="sentence-transformers/all-mpnet-base-v2" + ) + parser.add_argument("--db-path", type=str, default=None) + parser.add_argument("--index-name", type=str, default=None) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument("--rag-num-emails", type=int, default=7) + args = parser.parse_args() + + assert args.path_to_emails.endswith( + ".jsonl" + ), f"Expecting a .jsonl file, but given = {args.path_to_emails}" + + print(f"--> Reading emails from: {args.path_to_emails}") + + # Read emails + with open(args.path_to_emails, "r") as f: + lines = f.readlines() + json_lines = [json.loads(line.strip(",")) for line in lines] + print(f"--> # emails = {len(json_lines)}") + + embeddings_model = rag.get_embeddings_model(args.embedding_model) + db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) + + path_for_outputs = args.path_to_emails.rsplit(".jsonl", 1)[0] + "_raft.jsonl" + num_processed_emails = 0 + start_time = time.time() + with open(path_for_outputs, "w") as f: + for i in tqdm(range(0, len(json_lines), args.batch_size)): + # TODO(armand): Fix this print for batched inference + print(f"--> Processing batch {i}/{len(json_lines)}") + batch = json_lines[i : i + args.batch_size] + emails = retrieve_similar_emails(batch, db, args.rag_num_emails) + num_processed_emails += len(emails) + + for item in emails: + f.write(json.dumps(item)) + f.write("\n") + + elapsed_time = time.time() - start_time + print(f"{elapsed_time:.2f} seconds to process {len(json_lines)} emails.") + + +if __name__ == "__main__": + main() diff --git a/src/panza3/data_preparation/split_data.py b/src/panza3/data_preparation/split_data.py new file mode 100644 index 0000000..a487dcc --- /dev/null +++ b/src/panza3/data_preparation/split_data.py @@ -0,0 +1,43 @@ +import argparse +import json +import random +from datetime import datetime +from os import makedirs +from os.path import join + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--data-path", 
type=str, default=None) + parser.add_argument("--output-data-dir", type=str, default=None) + parser.add_argument("--train-ratio", type=float, default=0.8) + parser.add_argument("--split-type", type=str, default="random") + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + makedirs(args.output_data_dir, exist_ok=True) + + with open(args.data_path, "r") as f: + data = f.readlines() + + if args.split_type == "random": + random.seed(args.seed) + random.shuffle(data) + elif args.split_type == "chronological": + data = sorted(data, key=lambda x: datetime.fromisoformat(json.loads(x)["date"])) + else: + raise ValueError("Invalid split type.") + + train_size = int(len(data) * args.train_ratio) + + with open(join(args.output_data_dir, "train.jsonl"), "w") as f: + for i in range(train_size): + f.write(data[i]) + + with open(join(args.output_data_dir, "test.jsonl"), "w") as f: + for i in range(train_size, len(data)): + f.write(data[i]) + + +if __name__ == "__main__": + main() diff --git a/src/panza3/data_preparation/summarize_emails.py b/src/panza3/data_preparation/summarize_emails.py new file mode 100644 index 0000000..ec92270 --- /dev/null +++ b/src/panza3/data_preparation/summarize_emails.py @@ -0,0 +1,202 @@ +import argparse +import gc +import json +import os +import sys +import time +from typing import Dict, List, Text + +import torch +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + +from panza.utils import prompting + +sys.path.pop(0) + +MDL = os.environ.get("PANZA_GENERATIVE_MODEL") +TEMP = 0.7 +TOP_P = 0.7 +TOP_K = 50 + + +class LLMSummarizer: + def __init__(self, model, dtype, temperature, top_k, top_p, summarization_prompt, load_in_4bit) -> None: + self.device = "cuda" + + if load_in_4bit: + quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=dtype, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4', + ) + else: + quant_config = None + + self.model = AutoModelForCausalLM.from_pretrained( + model, torch_dtype=dtype, device_map=self.device, quantization_config=quant_config, trust_remote_code=True + ) + self.tokenizer = AutoTokenizer.from_pretrained( + model, model_max_length=self.model.config.max_position_embeddings, trust_remote_code=True + ) + self.tokenizer.padding_side = "left" + self.tokenizer.pad_token = self.tokenizer.eos_token + self.summarization_prompt = summarization_prompt + + _, self.prompt_end_wrapper, _, self.response_end_wrapper = ( + prompting.get_model_special_tokens(self.model.name_or_path) + ) + + # Save sampling parameters + self.temperature = temperature + self.top_k = top_k + self.top_p = top_p + + def prepare_batch_for_inference(self, emails: List[Dict]) -> List[Text]: + batch_with_prompt = [] + for item in emails: + prompt_with_email = self.summarization_prompt.format(email=item["email"]) + batch_with_prompt.append([{"role": "user", "content": prompt_with_email}]) + return batch_with_prompt + + def run_inference(self, emails: List[Dict]) -> List[Dict]: + gc.collect() + torch.cuda.empty_cache() + batch = self.prepare_batch_for_inference(emails) + + model_inputs = self.tokenizer.apply_chat_template( + batch, + return_tensors="pt", + add_generation_prompt=True, + padding=True, + truncation=True, + return_dict=True, + ) + model_inputs = model_inputs.to(self.device) + + generated_ids = self.model.generate( + **model_inputs, + max_new_tokens=1024, + 
do_sample=True, + temperature=self.temperature, + top_k=self.top_k, + top_p=self.top_p, + pad_token_id=self.tokenizer.pad_token_id, + ) + + outputs = self.tokenizer.batch_decode(generated_ids) + + # Extract generated text + summaries = [] + for output in outputs: + output = output.split(self.prompt_end_wrapper)[-1] + output = output.split(self.response_end_wrapper)[0] + output = output.strip() + summaries.append(output) + + return summaries + + +def generate_synthetic_instructions(emails: List[Dict], summarizer: LLMSummarizer): + summarized_emails = [] + + summaries = summarizer.run_inference(emails) + + for j, generated_text in enumerate(summaries): + + # Check if the outputs are valid + keyword = "Instruction: " + if generated_text.count(keyword) != 1: + print( + f"[WARNING] Skipping this sample:\n{generated_text}\n-----> " + f"[REASON] it contains none or multiple instances of the keyword = {keyword}, " + "but we expect exactly one" + ) + continue + + instruction = generated_text.split(keyword, 1)[1] + summarized_emails.append( + { + "email": emails[j]["email"], + "subject": emails[j]["subject"], + "summary": instruction, + "thread": emails[j]["thread"], + "date": emails[j]["date"], + } + ) + + return summarized_emails + + +def main(): + parser = argparse.ArgumentParser( + description="Transform emails into dataset for PANZA finetuning" + ) + parser.add_argument("--path-to-emails", help="Path to the cleaned emails") + parser.add_argument("--prompt-file", help="A path to file with prompt text") + parser.add_argument("--batch-size", type=int, help="Inference batch size") + parser.add_argument("--load-in-4bit", default=False, action='store_true', help="Wheather to load the model in 4bit precision (BNB)") + parser.add_argument("--fp32", default=False, action='store_true', help="Whether to use FP32 precision for computation") + args = parser.parse_args() + + assert args.path_to_emails.endswith( + ".jsonl" + ), f"Expecting a .jsonl file, but given = {args.path_to_emails}" + + assert os.path.exists( + args.prompt_file + ), f"Prompt file does not exist. 
Given path = {args.prompt_file}" + with open(args.prompt_file, "r") as file: + summarization_prompt = file.read() + + print(f"--> Reading emails from: {args.path_to_emails}") + print(f"--> Processing with batch_size {args.batch_size} and prompt = {summarization_prompt}") + print( + f"--> params for sampling:" + f"\t model = {MDL}" + f"\t temperature = {TEMP}" + f"\t top_p = {TOP_P}" + ) + + # Read emails + with open(args.path_to_emails, "r") as f: + lines = f.readlines() + json_lines = [json.loads(line.strip(',')) for line in lines] + print(f"--> # emails = {len(json_lines)}") + + summarizer = LLMSummarizer( + model=MDL, + dtype=torch.float32 if args.fp32 else torch.bfloat16, + temperature=TEMP, + top_p=TOP_P, + top_k=TOP_K, + summarization_prompt=summarization_prompt, + load_in_4bit=args.load_in_4bit + ) + + # Generate synthetic instructions + path_for_outputs = args.path_to_emails.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" + num_processed_emails = 0 + start_time = time.time() + with open(path_for_outputs, "w") as f: + for i in tqdm(range(0, len(json_lines), args.batch_size)): + # TODO(armand): Fix this print for batched inference + print(f"--> Processing batch {i}/{len(json_lines)}") + batch = json_lines[i : i + args.batch_size] + summarized_emails = generate_synthetic_instructions(batch, summarizer) + num_processed_emails += len(summarized_emails) + + # Write the summarized emails to a file + for item in summarized_emails: + f.write(json.dumps(item)) + f.write("\n") + + elapsed_time = time.time() - start_time + print(f"{elapsed_time:.2f} seconds to process {len(json_lines)} emails.") + + +if __name__ == "__main__": + main() diff --git a/src/panza3/interface/gui_b.py b/src/panza3/interface/gui_b.py new file mode 100644 index 0000000..c42e019 --- /dev/null +++ b/src/panza3/interface/gui_b.py @@ -0,0 +1,31 @@ +from panza3.entities.instruction import EmailInstruction, Instruction +from panza3.writer import PanzaWriter +import gradio as gr + + +class PanzaGUI: + def __init__(self, writer: PanzaWriter, **kwargs): + self.writer = writer + with gr.Blocks() as panza: + gr.Markdown("# Panza\n") + inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER") + outputbox = gr.Textbox(label="Output", placeholder="Generated result from the model") + inputbox.submit( + self.get_execute(), + [inputbox], + [outputbox], + ) + + panza.queue().launch(server_name="localhost", server_port=5003, share=True) + + def get_execute(self): + def execute(input): + instruction: Instruction = EmailInstruction(input) + stream = self.writer.run(instruction, stream=False) + #output = "" + #for chunk in stream: + # output += chunk + #yield stream.end() + yield stream + + return execute diff --git a/src/panza3/utils/documents.py b/src/panza3/utils/documents.py new file mode 100644 index 0000000..ccd7f85 --- /dev/null +++ b/src/panza3/utils/documents.py @@ -0,0 +1,46 @@ +import copy +import json +from abc import ABC, abstractmethod +from dataclasses import asdict, dataclass +from datetime import datetime +from typing import Dict, List, Optional, Union + + +@dataclass +class Document(ABC): + summary: Optional[str] = None + + @abstractmethod + def serialize(self) -> dict: + """Convert the document to a dictionary that can be serialized to JSON.""" + pass + + @classmethod + @abstractmethod + def deserialize(cls, data: Union[str, Dict]) -> "Document": + """Convert a serialized document into a Document object.""" + pass + + +@dataclass(kw_only=True) +class Email(Document): + email: str + subject: str + thread: 
List[str] + date: datetime + + def serialize(self) -> dict: + dictionary = asdict(self) + dictionary["date"] = self.date.isoformat() + return dictionary + + @classmethod + def deserialize(cls, data: Union[str, Dict]) -> "Email": + if isinstance(data, str): + dictionary = json.loads(data) + elif isinstance(data, dict): + dictionary = copy.deepcopy(data) + else: + raise ValueError(f"Cannot deserialize data of type {type(data)}. Must be str or dict.") + dictionary["date"] = datetime.fromisoformat(dictionary["date"]) + return cls(**dictionary) diff --git a/src/panza3/utils/prompting.py b/src/panza3/utils/prompting.py new file mode 100644 index 0000000..87e18d8 --- /dev/null +++ b/src/panza3/utils/prompting.py @@ -0,0 +1,175 @@ +from typing import List, Optional, Text + +from panza.utils.documents import Email + +MISTRAL_PROMPT_START_WRAPPER = "[INST] " +MISTRAL_PROMPT_END_WRAPPER = " [/INST]" +MISTRAL_RESPONSE_START_WRAPPER = "" +MISTRAL_RESPONSE_END_WRAPPER = "" + +LLAMA3_PROMPT_START_WRAPPER = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" +LLAMA3_PROMPT_END_WRAPPER = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +LLAMA3_RESPONSE_START_WRAPPER = "" +LLAMA3_RESPONSE_END_WRAPPER = "<|eot_id|>" + +PHI3_PROMPT_START_WRAPPER = "<|user|> " +PHI3_PROMPT_END_WRAPPER = "<|end|><|assistant|> " +PHI3_RESPONSE_START_WRAPPER = "" +PHI3_RESPONSE_END_WRAPPER = "<|end|>" + + +def create_prompt( + user_input: Text, + system_preamble: Text, + user_preamble: Text, + rag_preamble: Optional[Text] = None, + relevant_emails: Optional[List[Email]] = None, + thread_preamble: Optional[Text] = None, + thread_emails: Optional[List[Text]] = None, +) -> Text: + + if relevant_emails: + assert rag_preamble, "RAG preamble format must be provided if similar emails are provided." + rag_prompt = _create_rag_preamble_from_emails(rag_preamble, relevant_emails).strip() + else: + rag_prompt = "" + + if thread_emails: + assert thread_preamble, "Thread preamble format must be provided if thread is provided." + thread_prompt = _create_threading_preamble( + thread_preamble, thread_emails + ).strip() + else: + thread_prompt = "" + + system_preamble = system_preamble.strip() + user_preamble = user_preamble.strip() + + prompt = "" + if system_preamble: + prompt += f"{system_preamble}\n\n" + if user_preamble: + prompt += f"{user_preamble}\n\n" + if rag_prompt: + prompt += f"{rag_prompt}\n\n" + if thread_prompt: + prompt += f"{thread_prompt}\n\n" + prompt += f"Instruction: {user_input}" + + return prompt + + +def _create_rag_preamble_from_emails(rag_preamble_format: Text, emails: List[Email]) -> Text: + rag_context = _create_rag_context_from_emails(emails) + return rag_preamble_format.format(rag_context=rag_context) + + +def _create_rag_context_from_emails(emails: List[Email]) -> Text: + """Creates a RAG context from a list of relevant e-mails. + + The e-mails are formatted as follows: + + SUBJECT: + E-MAIL CONTENT: + + + --- + + SUBJECT: + E-MAIL CONTENT: + + + --- + ... 
+ """ + + rag_context = "" + for email in emails: + rag_context += ( + # f"SUBJECT: {email.metadata['subject']}\n" # TODO(armand): Handle subject metadata + f"E-MAIL CONTENT:\n{email.page_content}\n\n---\n\n" + ) + + return rag_context + + +def _create_threading_preamble( + threading_preamble_format: Text, thread: List[Text] +) -> Text: + threading_context = _create_threading_context(thread) + return threading_preamble_format.format(threading_context=threading_context) + + +def _create_threading_context(thread: List[Text]) -> Text: + """Creates a threading context from a list of relevant e-mails. + + The e-mails are formatted as follows: + + + + --- + + + + --- + ... + """ + + threading_context = "" + for email in thread: + threading_context += f"{email}\n\n---\n\n" + + return threading_context + + +def load_preamble(path): + with open(path, "r") as file: + return file.read().strip() + + +# The user preamble must be edited by the user in order to work as intended. +# Here, we perform additional checks to make sure that that happened; if not, +# We issue a warning to the user. +def load_user_preamble(path): + with open(path, "r") as file: + lines = [l for l in file.readlines() if not l.strip().startswith("#")] + print(lines) + preamble = "".join(lines) + if "CHANGE ME" in preamble: + print( + "*" * 66 + + "\n* WARNING: User prompt preamble not customized. *\n* Please edit the preamble at prompt_preambles/user_preamble.txt *\n" + + "*" * 66 + ) + return preamble + + +def load_all_preambles(system_preamble, user_preamble, rag_preamble, thread_preamble): + system_preamble = load_preamble(system_preamble) if system_preamble else "" + user_preamble = load_user_preamble(user_preamble) if user_preamble else "" + rag_preamble = load_preamble(rag_preamble) if rag_preamble else "" + thread_preamble = load_preamble(thread_preamble) if thread_preamble else "" + return system_preamble, user_preamble, rag_preamble, thread_preamble + + +def get_model_special_tokens(model_name): + model_name = model_name.lower() + if "llama" in model_name: + prompt_start_wrapper = LLAMA3_PROMPT_START_WRAPPER + prompt_end_wrapper = LLAMA3_PROMPT_END_WRAPPER + response_start_wrapper = LLAMA3_RESPONSE_START_WRAPPER + response_end_wrapper = LLAMA3_RESPONSE_END_WRAPPER + elif "mistral" in model_name.lower(): + prompt_start_wrapper = MISTRAL_PROMPT_START_WRAPPER + prompt_end_wrapper = MISTRAL_PROMPT_END_WRAPPER + response_start_wrapper = MISTRAL_RESPONSE_START_WRAPPER + response_end_wrapper = MISTRAL_RESPONSE_END_WRAPPER + elif "phi" in model_name.lower(): + prompt_start_wrapper = PHI3_PROMPT_START_WRAPPER + prompt_end_wrapper = PHI3_PROMPT_END_WRAPPER + response_start_wrapper = PHI3_RESPONSE_START_WRAPPER + response_end_wrapper = PHI3_RESPONSE_END_WRAPPER + else: + raise ValueError(f"Presets missing for prompting model {model_name}") + + return prompt_start_wrapper, prompt_end_wrapper, response_start_wrapper, response_end_wrapper diff --git a/src/panza3/utils/rag.py b/src/panza3/utils/rag.py new file mode 100644 index 0000000..5653ed9 --- /dev/null +++ b/src/panza3/utils/rag.py @@ -0,0 +1,37 @@ +from typing import List + +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import FAISS +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore + + +def get_embeddings_model(model_name) -> Embeddings: + embeddings_model = HuggingFaceEmbeddings( + model_name=model_name, + 
model_kwargs={"device": "cpu"}, + encode_kwargs={"normalize_embeddings": False}, + ) + return embeddings_model + + +def create_vector_db(docs: List[Document], embeddings_model: Embeddings) -> VectorStore: + db = FAISS.from_documents(docs, embeddings_model) + return db + + +def load_vector_db_from_disk( + folder_path: str, index_name: str, embeddings_model: Embeddings +) -> VectorStore: + try: + db = FAISS.load_local( + folder_path=folder_path, + embeddings=embeddings_model, + index_name=index_name, + allow_dangerous_deserialization=True, # Allows pickle deserialization + ) + print("Faiss index loaded ") + return db + except Exception as e: + print("Fiass index loading failed \n", e) From 1e6259706e45956214f1124ba191f56bb2a09be5 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Tue, 12 Nov 2024 17:02:15 +0100 Subject: [PATCH 104/112] Once again, try to centralize the main README. --- README_panza3.md | 234 ----------------------------------------------- 1 file changed, 234 deletions(-) delete mode 100644 README_panza3.md diff --git a/README_panza3.md b/README_panza3.md deleted file mode 100644 index c2c1d09..0000000 --- a/README_panza3.md +++ /dev/null @@ -1,234 +0,0 @@ -
- panza demo -
- -# Panza: A personal email assistant, trained and running on-device - - - -## What is Panza? - - - - -Panza is an automated email assistant customized to your writing style and past email history. \ -Its main features are as follows: -* Panza produces a fine-tuned LLM that matches your writing style, pairing it with a Retrieval-Augmented Generation (RAG) component which helps it produce relevant emails. -* Panza **can be trained and run entirely locally**. Currently, it requires a single GPU with -16-24 GiB of memory, but we also plan to release a CPU-only version. **At no point in training or execution is your data shared with the entities that trained the original LLMs, with LLM distribution services such as Huggingface, or with us.** -* Training and execution are also quick - for a dataset on the order of 1000 emails, training Panza takes well under an hour, and generating a new email takes a few seconds at most. - -
- panza logo -
- - -## TODO: Prerequisites -- Your emails, exported to `mbox` format (see tutorial below). -- A computer, preferably with a NVIDIA GPU with at least 24 GiB of memory (alternatively, check out [running in Google Colab](#cloud-try-out-panza-in-google-colab)). -- A Hugging Face [account](https://huggingface.co/login) to download the models (free of charge). -- [Optional] A Weights & Biases [account](https://wandb.ai/login) to log metrics during training (free of charge). -- Basic Python and Unix knowledge, such as building environments and running python scripts. -- *No prior LLMs experience is needed*. - - -## How it works - -### :film_projector: Step 1: Data playback - -For most email clients, it is possible to download a user's past emails in a machine-friendly .mbox format. For example, GMail allows you to do this via [Google Takeout](https://takeout.google.com), whereas Thunderbird allows one to do this via various plugins. - -One key part of Panza is a dataset-generation technique we call **data playback**: Given some of your past emails in .mbox format, we automatically create a training set for Panza by using a pretrained LLM to summarize the emails in instruction form; each email becomes a `(synthetic instruction, real email)` pair. -Given a dataset consisting of all pairs, we use these pairs to "play back" your sent emails: the LLM receives only the instruction, and has to generate the "ground truth" email as a training target. - -We find that this approach is very useful for the LLM to "learn" the user's writing style. - - -### :weight_lifting: Step 2: Local Fine-Tuning via Robust Adaptation (RoSA) - -We then use parameter-efficient finetuning to train the LLM on this dataset, locally. We found that we get the best results with the [RoSA method](https://arxiv.org/pdf/2401.04679.pdf), which combines low-rank (LoRA) and sparse finetuning. If parameter efficiency is not a concern, that is, you have a more powerful GPU, then regular, full-rank/full-parameter finetuning can also be used. We find that a moderate amount of further training strikes the right balance between matching the writer's style without memorizing irrelevant details in past emails. - - -### :owl: Step 3: Serving via RAG - -Once we have a custom user model, Panza can be run locally together with a Retrieval-Augmented Generation (RAG) module. Specifically, this functionality stores past emails in a database and provides a few relevant emails as context for each new query. This allows Panza to better insert specific details, such as a writer's contact information or frequently used Zoom links. - -The overall structure of Panza is as follows: -
- panza logo -
- -## Installation - -### Conda -1. Make sure you have a version of [conda](https://docs.anaconda.com/free/miniconda/miniconda-install/) installed. -2. Create a new conda environment named 'panza' (or something else) and activate it: -``` bash -conda create -n panza python=3.10 -y -conda activate panza -``` -3. Install the required packages: -``` bash -pip install . -``` -4. If you want to also finetune models using Panza, you will need to install the additional packages: -``` bash -pip install .[training] -``` - -## TODO: :rocket: Getting started - -To quickly get started with building your own personalized email assistant, follow the steps bellow: - - - - -### Step 0: Download your sent emails - -
- Expand for detailed download instructions. - - We provide a description for doing this for GMail via Google Takeout. - - 1. Go to [https://takeout.google.com/](https://takeout.google.com/). - 2. Click `Deselect all`. - 3. Find `Mail` section (search for the phrase `Messages and attachments in your Gmail account in MBOX format`). - 4. Select it. - 5. Click on `All Mail data included` and deselect everything except `Sent`. - 6. Scroll to the bottom of the page and click `Next step`. - 7. Click on `Create export`. - 8. Wait for download link to arrive in your inbox. - 9. Download `Sent.mbox` and place it in the `data/` directory. - - For Outlook accounts, we suggest doing this via a Thunderbird plugin for exporting a subset of your email as an MBOX format, such as [this add-on](https://addons.thunderbird.net/en-us/thunderbird/addon/importexporttools-ng/). -
- -At the end of this step you should have the downloaded emails placed inside `data/Sent.mbox`. - - -### Step 1: Environment configuration - - -Panza is configured through a set of yaml configurations defined in `configs/`. There is a single high-level config under `configs/base.yaml`, and the rest are organized under the main functionalities of the code. -Note that these task-specific configs can, in some cases, be used to override base configs. - Specific use cases, such as hyperparameter tuning, are covered in more detail in `scripts/README.md`. (TODO jen: write this up.) - -1. Data preparation: `configs/data_preparation.yaml`. Additionally, a custom user config must be added under `config/users/` (see below). -1. Finetuning: the main config is in `configs/panza_finetuning.yaml` and the method-specific ones are in `configs/finetuning/` -1. Serving: Serving consists of two parts - a serving infrastructure (that we call 'writer') that runs the LLM and so converts prompts to Panza outputs, and an `interface`, which presents the outputs in a useful form - through a command-line interface, a web interface, a gmail client (TODO:Sean), or in a bulk `.json` format (useful for evaluation). The configs for serving are in `panza_writer.yaml`, and for the interfaces, under `configs/interfaces`. - - -These scripts are described in more detail in `scripts/README.md`, but a few customizations need to happen immediately. -:warning: Before continuing, make sure you complete the following setup: -- Copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. A useful tip for choosing the name of `[YOURNAME]` is to set it to the output of `whoami`. If you modify the default yaml, you will need specify `user=default` as an extra flag in the succeeding steps. -- In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.). The username does not have to link to the email itself - it is simply used as a name for the various data files that will come out of the data preparation process. A handy way to set this is if you set it to be the output of the `whoami` call in your shell. -- Modify the personal prompt in `prompt_preambles/user_preamble.txt` to include some basic information about yourself that Panza can use to customize your emails with your correct full name, address, phone number, etc. - - -Additionally, please perform the following login steps to be able to download the base model. - - Login to Hugging Face to be able to download pretrained models: `huggingface-cli login`. - - [Optional] Login to Weights & Biases to log metrics during training: `wandb login`. Then, set `wandb_disabled=false` in `configs/finetuning/base.yaml`. - - -You are now ready to move to `scripts`. -``` bash -cd scripts -``` - -### Step 2: Extract emails - - -1. Run `CUDA_VISIBLE_DEVICES=X python ./prepare_data.py`.
- This scripts takes care of all the prerequisites before training (expand for details). - - - Extracts your emails in text format to `data/_clean.jsonl` which you can manually inspect. - - Creates synthetic prompts for your emails as described in the [data playback](#film_projector-step-1-data-playback) section. The results are stored in `data/_clean_summarized.jsonl` and you can inspect the `"summary"` field. - - Splits data into training and test subsets. See `data/train.jsonl` and `data/test.jsonl`. - - Creates a vector database from the embeddings of the training emails which will later be used for *Retrieval-Augmented Generation (RAG)*. See `data/.pkl` and `data/.faiss`. -
-**NB**: if you did not change the default configuration in `user/default.yaml` to reflect your particulars but rather created a new file, you need to add the additional flag to the above command where you specify `user=x` where your config file was named `x.yaml`. - -
- FAQs. - When running the above script, you may encounter an OutOfMemoryError. If this is the case, you can either: -
    -
  1. Reduce the batch size for the data processing step. This can be found in configs/panza_preparation.yaml. -
  2. Move to a machine that has more memory. -
-
- -ODO Jen: This doesn't work anymore, because we make the RAG database right away. If you wish to eliminate any emails from the training set (e.g. containing certain personal information), you can simply remove the corresponding rows. - -### Step 3: Train a LLM on your emails - - -We currently support `LLaMA3-8B-Instruct` and `Mistral-Instruct-v0.2` LLMs as base models; the former is the default, but we obtained good results with either model. - -1. [Recommended] For parameter efficient fine-tuning, run `./train_rosa.sh`. -If a larger GPU is available and full-parameter fine-tuning is possible, run `./train_fft.sh`. - -2. We have prepopulated the training configs with parameter values that worked best for us. We recommend you try those first, but you can also experiment with different hyper-parameters by passing extra arguments to the training script, such as `lr`, `lora_lr`, `num_epochs`. All the trained models are saved in the `checkpoints` directory. - -Examples: -``` bash -CUDA_VISIBLE_DEVICES=X ./train_rosa.sh # Will use the default parameters. - -CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep. -``` -
- FAQs. - The bash scripts that are used to execute the finetuning procedure assume by default that your username is what is returned by the whoami command. This is used to locate the name of the user configs inside the configs/user directory as above. If you directly modified default.yaml, or created another yaml file where the name of that file does not match with the output of whoami, there will be an error. This is an easy fix. You can either: -
    -
  1. Change the name of the yaml file to be the output of whoami. -
  2. You can override the username manually when you launch the bash script by adding user=x where x is the name of the yaml file you created. For example: ./train_rosa.sh user=alonso -
-
- If you wish to add CUDA_VISIBLE_DEVICES to specify a specific GPU, please add this in the shell script directly by export CUDA_VISIBLE_DEVICES=x where x is the ID of the GPU you wish to use. -

- A known issue is that when you fine-tune your model with RAG, there can be a case when the tokenization of the dataset seemingly hangs. This is due to a known bug with with HF's map function where n_proc>1. To alleviate this issue, you can set torch.set_num_threads(1) in src/panza3/finetuning/train.py or set the equivalent parameter in configs/finetuning/rosa.yaml. -
- - -### Step 5: Launch Panza! - - -- To run Panza after a full training run, try something like `CUDA_VISIBLE_DEVICES=0 python3 runner.py user=USERNAME interfaces=cli writer/llm=transformers`. -- To run Panza after a RoSA or LoRA training run, replace `writer/llm=transformers` with `writer/llm=peft` TODO Armand: can we fix this? - - -:email: **Have fun with your new email writing assistant!** :email: - - - - -## :microscope: Advanced usage -- [Data Preparation Guide](./scripts/README.md#data-guide) -- [Hyper-Parameter Tuning Guide](./scripts/README.md#hyper-parameter-tuning-guide) -- [Prompt Preambles Tutorial](prompt_preambles/README.md) - -## :woman_technologist: Contributing -If you liked our work and want to contribute to improve the system, please feel free to do so! Make a _fork_ of our repository and once you have made your changes, submit a pull request so that we can review! - -One thing to mention: we want to make sure that we all adhere to the same coding standards, so we have added Black, a code formatter, as a prehook. To ensure that all your files are formatted with Black, do the following: - -1. Install the necessary dependencies -``` -pip install .[contributing] -``` - -2. Run the precommit command -``` -pre-commit install -``` - -3. Continue adding code as usual. All your code will be formatted by Black before commiting! - -## Authors - -Panza was conceived by Nir Shavit and Dan Alistarh and built by the [Distributed Algorithms and Systems group](https://ist.ac.at/en/research/alistarh-group/) at IST Austria. The contributors are (in alphabetical order): - -Dan Alistarh, Eugenia Iofinova, Eldar Kurtic, Ilya Markov, Armand Nicolicioiu, Mahdi Nikdan, Andrei Panferov, and Nir Shavit. - -Contact: dan.alistarh@ist.ac.at - -We thank our collaborators Michael Goin and Tony Wang at NeuralMagic and MIT for their helpful testing and feedback. From 71a85c93db2248adad1c05f57344f2982c594c32 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Tue, 12 Nov 2024 17:03:43 +0100 Subject: [PATCH 105/112] Update the README --- README.md | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d9824d7..033dd26 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ Note that these task-specific configs can, in some cases, be used to override ba These scripts are described in more detail in `scripts/README.md`, but a few customizations need to happen immediately. :warning: Before continuing, make sure you complete the following setup: -- Optionally, copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. A useful tip for choosing the name of `[YOURNAME]` is to set it to the output of `whoami`. +- Copy `users/default.yaml` to `users/[YOURNAME].yaml`. If this is skipped, perform the following modifications on `users/default.yaml` directly. A useful tip for choosing the name of `[YOURNAME]` is to set it to the output of `whoami`. If you modify the default yaml, you will need specify `user=default` as an extra flag in the succeeding steps. - In the user config, set the email address and username. The email address should be the sender address in the exported emails. (Panza uses this to edit out responses and other emails sent by a different author in the `.mbox` dump.). The username does not have to link to the email itself - it is simply used as a name for the various data files that will come out of the data preparation process. 
A handy way to set this is to use the output of the `whoami` call in your shell.
- Modify the personal prompt in `prompt_preambles/user_preamble.txt` to include some basic information about yourself that Panza can use to customize your emails with your correct full name, address, phone number, etc.

@@ -146,6 +146,16 @@ cd scripts
 - Splits data into training and test subsets. See `data/train.jsonl` and `data/test.jsonl`.
 - Creates a vector database from the embeddings of the training emails which will later be used for *Retrieval-Augmented Generation (RAG)*. See `data/.pkl` and `data/.faiss`.
 </details>
+**NB**: if you did not change the default configuration in `user/default.yaml` to reflect your particulars but rather created a new file, you need to add an extra flag to the above command, `user=x`, where `x.yaml` is the name of the config file you created.
+
+ FAQs. + When running the above script, you may encounter an OutOfMemoryError. If this is the case, you can either: +
    +
  1. Reduce the batch size for the data processing step. This can be found in configs/panza_preparation.yaml. +
  2. Move to a machine that has more memory. +
+
ODO Jen: This doesn't work anymore, because we make the RAG database right away. If you wish to eliminate any emails from the training set (e.g. containing certain personal information), you can simply remove the corresponding rows. @@ -161,9 +171,9 @@ If a larger GPU is available and full-parameter fine-tuning is possible, run `./ Examples: ``` bash -./train_rosa.sh # Will use the default parameters. +CUDA_VISIBLE_DEVICES=X ./train_rosa.sh # Will use the default parameters. -./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep. +CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep. ```
FAQs. @@ -174,6 +184,8 @@ Examples:
If you wish to add CUDA_VISIBLE_DEVICES to specify a specific GPU, please add this in the shell script directly by export CUDA_VISIBLE_DEVICES=x where x is the ID of the GPU you wish to use. +

+ A known issue is that when you fine-tune your model with RAG, there can be a case when the tokenization of the dataset seemingly hangs. This is due to a known bug with HF's map function where n_proc>1. To alleviate this issue, you can set torch.set_num_threads(1) in src/panza3/finetuning/train.py or set the equivalent parameter in configs/finetuning/rosa.yaml.
@@ -194,6 +206,23 @@ Examples: - [Hyper-Parameter Tuning Guide](./scripts/README.md#hyper-parameter-tuning-guide) - [Prompt Preambles Tutorial](prompt_preambles/README.md) +## :woman_technologist: Contributing +If you liked our work and want to contribute to improve the system, please feel free to do so! Make a _fork_ of our repository and once you have made your changes, submit a pull request so that we can review! + +One thing to mention: we want to make sure that we all adhere to the same coding standards, so we have added Black, a code formatter, as a prehook. To ensure that all your files are formatted with Black, do the following: + +1. Install the necessary dependencies +``` +pip install .[contributing] +``` + +2. Run the precommit command +``` +pre-commit install +``` + +3. Continue adding code as usual. All your code will be formatted by Black before commiting! + ## Authors Panza was conceived by Nir Shavit and Dan Alistarh and built by the [Distributed Algorithms and Systems group](https://ist.ac.at/en/research/alistarh-group/) at IST Austria. The contributors are (in alphabetical order): From 73308a0544e6eeb18c0a89999502491a0284a84c Mon Sep 17 00:00:00 2001 From: Jen Iofinova Date: Wed, 13 Nov 2024 10:14:14 +0100 Subject: [PATCH 106/112] Update README.md remove resolved TODO --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 033dd26..032396b 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,6 @@ cd scripts -ODO Jen: This doesn't work anymore, because we make the RAG database right away. If you wish to eliminate any emails from the training set (e.g. containing certain personal information), you can simply remove the corresponding rows. ### Step 3: Train a LLM on your emails From e5a9e44276df339e5fc7e3dcf1dec4c38af96f22 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Wed, 13 Nov 2024 10:27:25 +0100 Subject: [PATCH 107/112] update hyperparameter tuning guide --- scripts/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index d058937..00e2e91 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -59,18 +59,18 @@ python runner.py user=jen interfaces=json writer/llm=[peft|transformers] checkpo :bulb: We recommend having between 128 and 1000 sent emails as training targets. Less than 128 might cause the model to overfit, while we haven't found that more than 1000 emails help for the style transfer. However, we encourage you to include as many emails as available in the RAG database, as they will provide the model with additional context. To sub-select training data, you can perform the usual flow with all of your data (export, run `extract_emails.sh` and `prepare_dataset.sh`), and then simply remove all but your target number of rows from the resulting `train.jsonl` in the `data`. -:bulb: To merge data from multiple mailboxes (such as combining your personal and work emails), run `extract_emails.sh` on each `.mbox` file, remembering to change the value of `PANZA_EMAIL_ADDRESS` in `config.sh` for every inbox. Then simply concatenate the resulting `[email_id].clean.jsonl` files to one, and use that file's `email_id` for the `PANZA_EMAIL_ADDRESS` argument in `config.sh` going forward. Make sure that the `prepare_dataset.sh` script is run _after_ the merge. 
+:bulb: To merge data from multiple mailboxes (such as combining your personal and work emails), run `extract_emails.sh` on each `.mbox` file, remembering to change the value of `user.email_address` and `user.user_name` in `config.sh` for every inbox. Then simply concatenate the resulting `[user.user_name].clean.jsonl` files into one, and use that file's `user.user_name` going forward. Make sure that the `prepare_dataset.sh` script is run _after_ the merge with `force_extract_clean_emails=false`.

 ### Hyper-Parameter Tuning Guide

 To get the most out of Panza, it is essential to find good hyper-parameters for the fine-tuning process.
-Specifically the key parameters to consider are the learning rates (`LR` and `LORA_LR`, in the case of RoSA fine-tuning) and (`NUM_EPOCHS`) parameters, whose values should be adjusted based on your amount of data and model in use.
+Specifically, the key parameters to consider are the learning rates (`trainer.optimizer.lr` and, for RoSA fine-tuning, `trainer.optimizer.rosa.lora_lr`) and the training duration (`trainer.optimizer.max_duration`), whose values should be adjusted based on your amount of data and the model in use.

 Here are some general guidelines for hyper-parameter fine-tuning:

 * In our experience, a good target for the Perplexity over the training set (displayed during and at the end of the training run) is in the range 1-1.5 (for full fine-tuning) to 2-3 (for RoSA tuning). At that point, Panza should be able to reproduce your writing style quite faithfully.
-* To reach this target, you can ajust two parameters: the length of training (`NUM_EPOCHS`) and the learning rates (`LR` for full fine-tuning and `LR` and `LORA_LR` for RoSA).
+* To reach this target, you can adjust two parameters: the length of training (`trainer.optimizer.max_duration`) and the learning rates (`trainer.optimizer.lr` for full fine-tuning, and `trainer.optimizer.lr` together with `trainer.optimizer.rosa.lora_lr` for RoSA).
 * Specifically, for full fine-tuning we have found 3 training epochs to be sufficient. For RoSA fine-tuning, one usually needs 5-7 epochs for best results.
-* Regarding the learning rates, we have already provided stable default values (around 1e-5 for both LLaMA3-8B and Mistral). You may adjust these depending on the amount of your local data.
+* Regarding the learning rates, we have already provided stable default values (around 1e-5 for LLaMA3-8B, Phi-3.5-mini, and Mistral). You may adjust these depending on the amount of your local data.
 * We have found that setting these values too low will yield default "impersonal" answers (specifically, the same answers as the base model with some context). Setting them too high will lead the model to "overfit" to the user data, to the point where a lot of the latent model "knowledge" is lost. The key to good performance is to find a good middle ground between these two scenarios.
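
For example, starting from the defaults, a slightly shorter RoSA run with lower learning rates could be launched as shown below. This is only an illustrative sketch: the flag names are taken from the `train_rosa.sh` example in the main README, and the specific values are starting points to experiment with, not tuned defaults.

``` bash
# Illustrative RoSA tuning run: 5 epochs, both learning rates lowered to 5e-6.
CUDA_VISIBLE_DEVICES=0 ./train_rosa.sh finetuning.lr=5e-6 finetuning.rosa_lr=5e-6 finetuning.max_duration=5ep
```

If the resulting training perplexity stays well above the 2-3 range mentioned above, train for longer or raise the learning rates; if the model starts reproducing past emails verbatim, do the opposite.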
From b3bc00fc42dd0a995b53d9a907edc5b87e0dec0d Mon Sep 17 00:00:00 2001 From: Andrej Jovanovic Date: Wed, 13 Nov 2024 11:24:59 +0100 Subject: [PATCH 108/112] Refactor panza3 -> panza --- README.md | 6 +- configs/finetuning/base.yaml | 2 +- configs/interfaces/cli.yaml | 2 +- configs/interfaces/gui.yaml | 2 +- configs/interfaces/json.yaml | 2 +- configs/interfaces/web.yaml | 2 +- configs/user/default.yaml | 2 +- configs/user/jen.yaml | 9 - configs/writer/email.yaml | 2 +- configs/writer/llm/ollama.yaml | 2 +- configs/writer/llm/peft.yaml | 2 +- configs/writer/llm/transformers.yaml | 2 +- configs/writer/prompting/email_prompting.yaml | 2 +- configs/writer/prompting/retriever/faiss.yaml | 2 +- configs/writer/prompting/retriever/none.yaml | 2 +- .../prompting/summarization_prompting.yaml | 2 +- configs/writer/summary.yaml | 2 +- scripts/config.sh | 59 - scripts/eval_base_model.sh | 102 -- scripts/eval_model.sh | 34 - scripts/eval_rosa.sh | 114 -- scripts/extract_emails.sh | 11 - scripts/finetune.py | 237 ---- scripts/finetune_simple.py | 1016 ----------------- scripts/output.tx | 0 scripts/prepare_data.py | 10 +- scripts/prepare_dataset.sh | 66 -- scripts/prepare_train_eval.sh | 63 + scripts/run_panza_cli.sh | 31 - scripts/runner.py | 2 +- scripts/train_fft.sh | 2 +- scripts/train_rosa.sh | 4 +- src/panza/__init__.py | 55 + .../data_preparation/create_vector_store.py | 61 - src/panza/data_preparation/extract_emails.py | 84 +- src/{panza3 => panza}/data_preparation/rag.py | 0 src/{panza3 => panza}/entities/__init__.py | 0 src/{panza3 => panza}/entities/document.py | 0 src/{panza3 => panza}/entities/instruction.py | 0 .../evaluation/.evaluate_summaries.py.swp | Bin 16384 -> 0 bytes src/panza/evaluation/base_inference.py | 117 -- .../console_interactive_inference.py | 66 -- src/panza/evaluation/evaluate | 180 --- src/panza/evaluation/evaluate_backup.py | 180 --- src/panza/evaluation/evaluation.py | 194 ---- src/panza/evaluation/gui_inference.py | 89 -- src/panza/evaluation/ollama_inference.py | 83 -- .../evaluation/ollama_service_inference.py | 77 -- src/panza/evaluation/service_inference.py | 103 -- .../finetuning/preprocessing.py | 2 +- src/{panza3 => panza}/finetuning/train.py | 8 +- src/{panza3 => panza}/interface/__init__.py | 0 src/{panza3 => panza}/interface/cli.py | 4 +- src/{panza3 => panza}/interface/gui.py | 4 +- src/{panza3 => panza}/interface/gui_b.py | 10 +- src/{panza3 => panza}/interface/json.py | 4 +- src/{panza3 => panza}/interface/web.py | 4 +- src/{panza3 => panza}/llm/__init__.py | 0 src/{panza3 => panza}/llm/base.py | 0 src/{panza3 => panza}/llm/local.py | 0 src/{panza3 => panza}/llm/ollama.py | 0 src/{panza3 => panza}/prompting/__init__.py | 0 src/{panza3 => panza}/prompting/base.py | 0 .../prompting/email_prompting.py | 0 .../prompting/summarization_prompting.py | 0 src/{panza3 => panza}/prompting/utils.py | 0 src/{panza3 => panza}/retriever/__init__.py | 0 src/{panza3 => panza}/retriever/base.py | 0 src/{panza3 => panza}/retriever/faiss.py | 0 src/{panza3 => panza}/retriever/none.py | 0 src/panza/utils/prompting.py | 130 +++ src/panza/utils/rag.py | 34 + src/{panza3 => panza}/writer.py | 0 src/panza3/__init__.py | 10 - src/panza3/data_preparation/extract_emails.py | 208 ---- .../data_preparation/prepare_raft_emails.py | 92 -- src/panza3/data_preparation/split_data.py | 43 - .../data_preparation/summarize_emails.py | 202 ---- src/panza3/utils/documents.py | 46 - src/panza3/utils/prompting.py | 175 --- src/panza3/utils/rag.py | 37 - tests/conftest.py | 7 +- 
tests/test_entities.py | 2 +- tests/test_local_llm.py | 4 +- tests/test_ollama_llm.py | 4 +- tests/test_prompting.py | 23 +- tests/test_retriever.py | 4 +- tests/test_writer.py | 8 +- 88 files changed, 404 insertions(+), 3746 deletions(-) delete mode 100644 configs/user/jen.yaml delete mode 100755 scripts/config.sh delete mode 100755 scripts/eval_base_model.sh delete mode 100755 scripts/eval_model.sh delete mode 100755 scripts/eval_rosa.sh delete mode 100755 scripts/extract_emails.sh delete mode 100644 scripts/finetune.py delete mode 100644 scripts/finetune_simple.py delete mode 100644 scripts/output.tx delete mode 100755 scripts/prepare_dataset.sh create mode 100755 scripts/prepare_train_eval.sh delete mode 100755 scripts/run_panza_cli.sh delete mode 100644 src/panza/data_preparation/create_vector_store.py rename src/{panza3 => panza}/data_preparation/rag.py (100%) rename src/{panza3 => panza}/entities/__init__.py (100%) rename src/{panza3 => panza}/entities/document.py (100%) rename src/{panza3 => panza}/entities/instruction.py (100%) delete mode 100644 src/panza/evaluation/.evaluate_summaries.py.swp delete mode 100644 src/panza/evaluation/base_inference.py delete mode 100644 src/panza/evaluation/console_interactive_inference.py delete mode 100644 src/panza/evaluation/evaluate delete mode 100644 src/panza/evaluation/evaluate_backup.py delete mode 100644 src/panza/evaluation/evaluation.py delete mode 100644 src/panza/evaluation/gui_inference.py delete mode 100644 src/panza/evaluation/ollama_inference.py delete mode 100644 src/panza/evaluation/ollama_service_inference.py delete mode 100644 src/panza/evaluation/service_inference.py rename src/{panza3 => panza}/finetuning/preprocessing.py (97%) rename src/{panza3 => panza}/finetuning/train.py (99%) rename src/{panza3 => panza}/interface/__init__.py (100%) rename src/{panza3 => panza}/interface/cli.py (79%) rename src/{panza3 => panza}/interface/gui.py (89%) rename src/{panza3 => panza}/interface/gui_b.py (81%) rename src/{panza3 => panza}/interface/json.py (98%) rename src/{panza3 => panza}/interface/web.py (95%) rename src/{panza3 => panza}/llm/__init__.py (100%) rename src/{panza3 => panza}/llm/base.py (100%) rename src/{panza3 => panza}/llm/local.py (100%) rename src/{panza3 => panza}/llm/ollama.py (100%) rename src/{panza3 => panza}/prompting/__init__.py (100%) rename src/{panza3 => panza}/prompting/base.py (100%) rename src/{panza3 => panza}/prompting/email_prompting.py (100%) rename src/{panza3 => panza}/prompting/summarization_prompting.py (100%) rename src/{panza3 => panza}/prompting/utils.py (100%) rename src/{panza3 => panza}/retriever/__init__.py (100%) rename src/{panza3 => panza}/retriever/base.py (100%) rename src/{panza3 => panza}/retriever/faiss.py (100%) rename src/{panza3 => panza}/retriever/none.py (100%) rename src/{panza3 => panza}/writer.py (100%) delete mode 100644 src/panza3/__init__.py delete mode 100644 src/panza3/data_preparation/extract_emails.py delete mode 100644 src/panza3/data_preparation/prepare_raft_emails.py delete mode 100644 src/panza3/data_preparation/split_data.py delete mode 100644 src/panza3/data_preparation/summarize_emails.py delete mode 100644 src/panza3/utils/documents.py delete mode 100644 src/panza3/utils/prompting.py delete mode 100644 src/panza3/utils/rag.py diff --git a/README.md b/README.md index 032396b..a7b983f 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ cd scripts ### Step 2: Extract emails -1. Run `CUDA_VISIBLE_DEVICES=X python ./prepare_data.py`.
+1. Run `CUDA_VISIBLE_DEVICES=X ./prepare_data.sh`.
This script takes care of all the prerequisites before training (expand for details).
 - Extracts your emails in text format to `data/_clean.jsonl` which you can manually inspect.
@@ -184,14 +184,14 @@ CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-
If you wish to use CUDA_VISIBLE_DEVICES to select a specific GPU, add `export CUDA_VISIBLE_DEVICES=x` directly in the shell script, where x is the ID of the GPU you wish to use.

- A known issue is that when you fine-tune your model with RAG, there can be a case when the tokenization of the dataset seemingly hangs. This is due to a known bug with with HF's map function where n_proc>1. To alleviate this issue, you can set torch.set_num_threads(1) in src/panza3/finetuning/train.py or set the equivalent parameter in configs/finetuning/rosa.yaml.
+ A known issue is that when you fine-tune your model with RAG, the tokenization of the dataset can seemingly hang. This is due to a known bug with HF's map function when n_proc>1. To alleviate this issue, you can set torch.set_num_threads(1) in src/panza/finetuning/train.py or set the equivalent parameter in configs/finetuning/rosa.yaml.
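+
+ A minimal sketch of that workaround (the exact placement inside src/panza/finetuning/train.py is up to you; any point before the dataset is tokenized should work):
+
+ ```python
+ import torch
+
+ # Workaround for the tokenization hang described above (HF datasets map with n_proc > 1).
+ torch.set_num_threads(1)
+ ```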
### Step 5: Launch Panza! -- To run Panza after a full training run, try something like `CUDA_VISIBLE_DEVICES=0 python3 runner.py user=USERNAME interfaces=cli writer/llm=transformers`. +- To run Panza after a full training run, try something like `CUDA_VISIBLE_DEVICES=0 ./runner.sh user=USERNAME interfaces=cli writer/llm=transformers`. - To run Panza after a RoSA or LoRA training run, replace `writer/llm=transformers` with `writer/llm=peft` TODO Armand: can we fix this? diff --git a/configs/finetuning/base.yaml b/configs/finetuning/base.yaml index cf594fd..59eb3fb 100644 --- a/configs/finetuning/base.yaml +++ b/configs/finetuning/base.yaml @@ -36,7 +36,7 @@ train_loader: split: train hf_kwargs: data_files: ${user.data_dir}/train.jsonl - preprocessing_fn: panza3.finetuning.preprocessing:panza_preprocessing_function + preprocessing_fn: panza.finetuning.preprocessing:panza_preprocessing_function max_seq_len: ${finetuning.max_seq_len} allow_pad_trimming: false decoder_only_format: true diff --git a/configs/interfaces/cli.yaml b/configs/interfaces/cli.yaml index 1b69409..1c18948 100644 --- a/configs/interfaces/cli.yaml +++ b/configs/interfaces/cli.yaml @@ -1 +1 @@ -_target_: panza3.interface.PanzaCLI \ No newline at end of file +_target_: panza.interface.PanzaCLI \ No newline at end of file diff --git a/configs/interfaces/gui.yaml b/configs/interfaces/gui.yaml index 49b5515..8a6497d 100644 --- a/configs/interfaces/gui.yaml +++ b/configs/interfaces/gui.yaml @@ -1 +1 @@ -_target_: panza3.interface.PanzaGUI \ No newline at end of file +_target_: panza.interface.PanzaGUI \ No newline at end of file diff --git a/configs/interfaces/json.yaml b/configs/interfaces/json.yaml index 4a51fbc..a91f254 100644 --- a/configs/interfaces/json.yaml +++ b/configs/interfaces/json.yaml @@ -6,4 +6,4 @@ checkpoint: ${checkpoint} panza_workspace: ${panza_workspace} compute_metrics: true username: ${user.username} -_target_: panza3.interface.PanzaJSON \ No newline at end of file +_target_: panza.interface.PanzaJSON \ No newline at end of file diff --git a/configs/interfaces/web.yaml b/configs/interfaces/web.yaml index 0ac3c70..59cfce9 100644 --- a/configs/interfaces/web.yaml +++ b/configs/interfaces/web.yaml @@ -1,2 +1,2 @@ port: 5001 -_target_: panza3.interface.PanzaWebService \ No newline at end of file +_target_: panza.interface.PanzaWebService \ No newline at end of file diff --git a/configs/user/default.yaml b/configs/user/default.yaml index 4899d3a..8f2b9e6 100644 --- a/configs/user/default.yaml +++ b/configs/user/default.yaml @@ -1,5 +1,5 @@ email_address: "abc@xyz.com" # Change this to your email address! -username: "jen.iofinova" # TODO(armand): Use custom resolver to extract username from email address. +username: "abc" # This identifies the user in the users directory and the names of the emails files. data_dir: ${panza_workspace}/data diff --git a/configs/user/jen.yaml b/configs/user/jen.yaml deleted file mode 100644 index f2b4203..0000000 --- a/configs/user/jen.yaml +++ /dev/null @@ -1,9 +0,0 @@ -email_address: "jen.iofinova@gmail.com" # Change this to your email address! -username: "jen.iofinova" # TODO(armand): Use custom resolver to extract username from email address. 
- -data_dir: ${panza_workspace}/data - -system_preamble_path: ${panza_workspace}/prompt_preambles/system_preamble.txt -user_preamble_path: ${panza_workspace}/prompt_preambles/user_preamble.txt -rag_preamble_path: ${panza_workspace}/prompt_preambles/rag_preamble.txt -thread_preamble_path: ${panza_workspace}/prompt_preambles/thread_preamble.txt \ No newline at end of file diff --git a/configs/writer/email.yaml b/configs/writer/email.yaml index 3b816d4..49de780 100644 --- a/configs/writer/email.yaml +++ b/configs/writer/email.yaml @@ -2,4 +2,4 @@ defaults: - llm: transformers - prompting: email_prompting -_target_: panza3.writer.PanzaWriter +_target_: panza.writer.PanzaWriter diff --git a/configs/writer/llm/ollama.yaml b/configs/writer/llm/ollama.yaml index 0a183b0..e384a6b 100644 --- a/configs/writer/llm/ollama.yaml +++ b/configs/writer/llm/ollama.yaml @@ -1,6 +1,6 @@ defaults: - sampling: random -_target_: panza3.llm.OllamaLLM +_target_: panza.llm.OllamaLLM name: "custom" gguf_file: "custom.gguf" diff --git a/configs/writer/llm/peft.yaml b/configs/writer/llm/peft.yaml index fcae93a..2c0a892 100644 --- a/configs/writer/llm/peft.yaml +++ b/configs/writer/llm/peft.yaml @@ -1,7 +1,7 @@ defaults: - sampling: random -_target_: panza3.llm.PeftLLM +_target_: panza.llm.PeftLLM name: ${checkpoint} checkpoint: ${checkpoint} device: "cuda" # Alternatively, "cuda" diff --git a/configs/writer/llm/transformers.yaml b/configs/writer/llm/transformers.yaml index ddfb3c6..305dc25 100644 --- a/configs/writer/llm/transformers.yaml +++ b/configs/writer/llm/transformers.yaml @@ -1,7 +1,7 @@ defaults: - sampling: random -_target_: panza3.llm.TransformersLLM +_target_: panza.llm.TransformersLLM name: ${checkpoint} checkpoint: ${checkpoint} device: "cuda" diff --git a/configs/writer/prompting/email_prompting.yaml b/configs/writer/prompting/email_prompting.yaml index 235fc3e..27b458a 100644 --- a/configs/writer/prompting/email_prompting.yaml +++ b/configs/writer/prompting/email_prompting.yaml @@ -1,7 +1,7 @@ defaults: - retriever: faiss -_target_: panza3.prompting.EmailPromptBuilder +_target_: panza.prompting.EmailPromptBuilder system_preamble: ${load_preamble:${user.system_preamble_path}} user_preamble: ${load_user_preamble:${user.user_preamble_path}} diff --git a/configs/writer/prompting/retriever/faiss.yaml b/configs/writer/prompting/retriever/faiss.yaml index 9a00354..fbf35c9 100644 --- a/configs/writer/prompting/retriever/faiss.yaml +++ b/configs/writer/prompting/retriever/faiss.yaml @@ -1,4 +1,4 @@ -_target_: panza3.retriever.FaissRetriever +_target_: panza.retriever.FaissRetriever db_path: ${user.data_dir} index_name: ${user.username} embedding_model: ${embedding_model} diff --git a/configs/writer/prompting/retriever/none.yaml b/configs/writer/prompting/retriever/none.yaml index 68be9b6..8015504 100644 --- a/configs/writer/prompting/retriever/none.yaml +++ b/configs/writer/prompting/retriever/none.yaml @@ -1 +1 @@ -_target_: panza3.retriever.NoneRetriever \ No newline at end of file +_target_: panza.retriever.NoneRetriever \ No newline at end of file diff --git a/configs/writer/prompting/summarization_prompting.yaml b/configs/writer/prompting/summarization_prompting.yaml index 6e44871..98449cc 100644 --- a/configs/writer/prompting/summarization_prompting.yaml +++ b/configs/writer/prompting/summarization_prompting.yaml @@ -1,3 +1,3 @@ -_target_: panza3.prompting.SummarizationPromptBuilder +_target_: panza.prompting.SummarizationPromptBuilder summarization_prompt: 
${load_preamble:${panza_workspace}/prompt_preambles/summarization_prompt.txt} diff --git a/configs/writer/summary.yaml b/configs/writer/summary.yaml index 76d8f83..827c4dd 100644 --- a/configs/writer/summary.yaml +++ b/configs/writer/summary.yaml @@ -2,4 +2,4 @@ defaults: - llm: transformers - prompting: summarization_prompting -_target_: panza3.writer.PanzaWriter +_target_: panza.writer.PanzaWriter diff --git a/scripts/config.sh b/scripts/config.sh deleted file mode 100755 index 08b1479..0000000 --- a/scripts/config.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -export PANZA_EMAIL_ADDRESS="firstname.lastname@gmail.com" # Change this to your email address! -export PANZA_USERNAME="${PANZA_EMAIL_ADDRESS%@*}" # Removes everything after @; for the example above, it will be firstname.lastname - -export PANZA_WORKSPACE=$(dirname "$(dirname "$(realpath "$0")")"); -export PANZA_DATA_DIR="$PANZA_WORKSPACE/data" # where data is stored -export PANZA_CHECKPOINTS="$PANZA_WORKSPACE/checkpoints" # where checkpoints are stored -export PANZA_FINETUNE_CONFIGS="$PANZA_WORKSPACE/src/panza/finetuning/configs" # where training configuration details are stored - -export PANZA_PREAMBLES="$PANZA_WORKSPACE/prompt_preambles" # this is where the system prompt and user prompt preambles can be accessed; you will need to edit these -export PANZA_SYSTEM_PREAMBLE_PATH="$PANZA_PREAMBLES/system_preamble.txt" # system prompt -# IMPORTANT: Please edit the user preamble (at the PANZA_USER_PREAMBLE_PATH) if you plan to use it (recommended). -export PANZA_USER_PREAMBLE_PATH="$PANZA_PREAMBLES/user_preamble.txt" # a useful preamble to the user instruction, explaining what's going on to the LLM -export PANZA_RAG_PREAMBLE_PATH="$PANZA_PREAMBLES/rag_preamble.txt" # a preamble for the RAG component -export PANZA_THREAD_PREAMBLE_PATH="$PANZA_PREAMBLES/thread_preamble.txt" # a preamble for the RAG component - -export PANZA_SUMMARIZATION_BATCH_SIZE=8 # batch size for summarization. -export PANZA_EVALUATION_BATCH_SIZE=1 # batch size for evaluation. Can safely be set to higher value (e.g., 8) if the GPU has enough capacity. - -export MODEL_PRECISION=bf16 # precision at which the base model is stored; options: bf16, fp32, or '4bit' -# export PANZA_GENERATIVE_MODEL="mistralai/Mistral-7B-Instruct-v0.2" -export PANZA_GENERATIVE_MODEL="ISTA-DASLab/Meta-Llama-3-8B-Instruct" -# export PANZA_GENERATIVE_MODEL="microsoft/Phi-3-mini-4k-instruct" - -lowercased=$(echo "$PANZA_GENERATIVE_MODEL" | tr '[:upper:]' '[:lower:]') -if [[ ${lowercased} == *llama* ]]; then - export MODEL_TYPE=llama3 -elif [[ ${lowercased} == *mistral* ]]; then - export MODEL_TYPE=mistralv2 -elif [[ ${lowercased} == *phi* ]]; then - export MODEL_TYPE=phi3 -else - echo "Model type ${PANZA_GENERATIVE_MODEL} not recognized! Panza only works with Mistral and Llama3 models. Exiting." 
- exit -fi - -export PANZA_EMBEDDING_MODEL="sentence-transformers/all-mpnet-base-v2" # embedding model for RAG; can be changed, trading off speed for quality - -export PANZA_RAG_RELEVANCE_THRESHOLD=0.2 # emails whose relevance is above this threshold will be presented for RAG - -export PANZA_SEED=42 # the one true seed - -export PANZA_FINETUNE_WITH_PREAMBLE=1 # states whether user and system preambles are used for fine-tuning; on by default -export PANZA_FINETUNE_WITH_RAG=0 # states whether RAG preambles are used for fine-tuning; off by default -export PANZA_FINETUNE_WITH_THREAD=0 # states whether the email thread is used for fine-tuning; off by default -export PANZA_FINETUNE_RAG_NUM_EMAILS=3 # maximum number of emails to use for RAG fine-tuning; 3 by default -export PANZA_FINETUNE_RAG_PROB=0.55 # probability of using RAG context for fine-tuning; 0.5 by default -export PANZA_FINETUNE_RAG_RELEVANCE_THRESHOLD=0.2 # emails whose relevance is above this threshold will be presented for RAG during fine-tuning -export PANZA_FINETUNE_THREAD_NUM_EMAILS=3 # maximum number of emails to use for thread fine-tuning; 3 by default -export PANZA_DISABLE_RAG_INFERENCE=0 # RAG inference is on by default, since it's usually better - -export PANZA_WANDB_DISABLED=True # disable Weights and Biases logging by default - -export PYTHONPATH="$PANZA_WORKSPACE/src:$PYTHONPATH" - -# Optionally, set your HF_HOME and/or TRANSFORMERS_CACHE here. -# export HF_HOME= -# export TRANSFORMERS_CACHE= diff --git a/scripts/eval_base_model.sh b/scripts/eval_base_model.sh deleted file mode 100755 index 9d7f97a..0000000 --- a/scripts/eval_base_model.sh +++ /dev/null @@ -1,102 +0,0 @@ -set -e - -source config.sh - -current_user=$(whoami) - -export DATA_PATH=${PANZA_DATA_DIR}/train.jsonl - -# hyper-parameters with default values -export MASK_GEN_MODEL_PRECISION=${MODEL_PRECISION} # bf16, fp32, or 4bit -export BASE_SAVE_PATH=${PANZA_CHECKPOINTS} # where to store the checkpoints and generated masks -export NUM_EPOCHS=5 -export WARMUP=8 # the learning rate warmup (batches) -export BS=8 -export PER_DEVICE_BS=1 -export LORA_ALPHA=16 -export SCHEDULE=wl16 # the RoSA schedule -export SPA_NUM_GRADS=1 # number of gradients used for mask generation -export SPA_GRAD_ACC_MODE=mean_squared # 'mean' or 'mean_squared': how to accumulate gradients -export SEED=${PANZA_SEED} - - -export PANZA_RAG_RELEVANCE_THRESHOLD=0 # emails whose relevance is above this threshold will be presented for RAG - -if [[ ${MODEL_TYPE} == llama3 ]]; then - export LR=1e-5 # learning rate - export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters -elif [[ ${MODEL_TYPE} == mistralv2 ]]; then - export LR=1e-5 # learning rate - export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters -else - echo "Model type ${MODEL_TYPE} not recognized! Panza only works with mistralv2 and llama3 models. Exiting." 
- exit -fi - -echo "Using Learning Rate ${LR} and LoRA LR ${LORA_LR} for ${MODEL_TYPE} model" - - -# hyper-parameters without default values -export SPA_DENSITY=0.01 # the sparse adapters' density -export LORA_R=8 # the low-rank adapters' rank - -export WANDB_PROJECT="panza-${current_user}" -export PRETRAINED=${PANZA_GENERATIVE_MODEL} -export CONFIG=${PANZA_FINETUNE_CONFIGS}/rosa_panza.yaml -export NUM_CPU_THREADS=0 # useful for running of CPU, 0 means default the used by torch - -export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" # if not set, default to 0 - -# take all the input arguments and put them in environment variables -# this could override the hyper-parameters defined above -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - -if [ "$PANZA_FINETUNE_WITH_PREAMBLE" = 1 ]; then - PREAMBLE_STR="PREAMBLE" - PREPROCESSING_FN=panza.finetuning.preprocessing:panza_preprocessing_function_train_with_preamble -else - PREAMBLE_STR="" - PREPROCESSING_FN=panza.finetuningpreprocessing:panza_preprocessing_function -fi - -# some post-processing on the inputs - - - - -echo $RUN_NAME -# Running BLEU evaluation -EVAL_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/evaluation.py -# python ${EVAL_SCRIPT} \ -# --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ -# --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ -# --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ -# --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ -# --golden=${PANZA_DATA_DIR}/test.jsonl \ -# --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ -# --wandb-run-id=${WANDB_RUN_ID} \ -# ${USE_4BIT_QUANT} - - #--model=/nfs/scistore19/alistgrp/eiofinov/.cache/huggingface/hub/models--ISTA-DASLab--Meta-Llama-3-8B-Instruct/snapshots/0e6f530447ceec1aea4fd96e2aafad06bb3aa4b5/ \ -# Running BLEU evaluation with RAG -python ${EVAL_SCRIPT} \ - --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --golden=${PANZA_DATA_DIR}/test.jsonl \ - --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ - --wandb-run-id=${WANDB_RUN_ID} \ - --embedding-model=${PANZA_EMBEDDING_MODEL} \ - --db-path=${PANZA_DATA_DIR} \ - --index-name=${PANZA_USERNAME} \ - --use-rag \ - ${USE_4BIT_QUANT} diff --git a/scripts/eval_model.sh b/scripts/eval_model.sh deleted file mode 100755 index 7e06e85..0000000 --- a/scripts/eval_model.sh +++ /dev/null @@ -1,34 +0,0 @@ -# Convenience script for running full finetuning. -# All arguments to the python script can be provided -# here exactly in the form they would be passed to the -# python script directly. -# -# Example usage: -# ./train_fft.sh user=alonso trainer.optimizer.lr=0.1 - -set -e - -vars=() -# Set a default for the required user argument. We'll override it -# later if provided. -vars[1]=$"user=$(whoami)" -idx=2 - -# process input arguments -for argument in "$@" -do - key=$(echo $argument | cut -f1 -d=) - - if [[ $key == user ]]; then - # We already set the default value here; change it now. - vars[1]=$argument - elif [[ $key == finetuning ]]; then - echo "The 'finetuning' argument is already set and should not be overridden here; override is ignored." 
- else - vars[idx]=$argument - idx+=1 - fi -done - -composer ../src/panza3/finetuning/train.py \ - finetuning=full ${vars[@]} \ No newline at end of file diff --git a/scripts/eval_rosa.sh b/scripts/eval_rosa.sh deleted file mode 100755 index 285df55..0000000 --- a/scripts/eval_rosa.sh +++ /dev/null @@ -1,114 +0,0 @@ -set -e - -source config.sh - -current_user=$(whoami) - -export DATA_PATH=${PANZA_DATA_DIR}/train.jsonl - -# hyper-parameters with default values -export MASK_GEN_MODEL_PRECISION=${MODEL_PRECISION} # bf16, fp32, or 4bit -export BASE_SAVE_PATH=${PANZA_CHECKPOINTS} # where to store the checkpoints and generated masks -export NUM_EPOCHS=5 -export WARMUP=8 # the learning rate warmup (batches) -export BS=8 -export PER_DEVICE_BS=1 -export LORA_ALPHA=16 -export SCHEDULE=wl16 # the RoSA schedule -export SPA_NUM_GRADS=1 # number of gradients used for mask generation -export SPA_GRAD_ACC_MODE=mean_squared # 'mean' or 'mean_squared': how to accumulate gradients -export SEED=${PANZA_SEED} - -if [[ ${MODEL_TYPE} == llama3 ]]; then - export LR=1e-5 # learning rate - export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters -elif [[ ${MODEL_TYPE} == mistralv2 ]]; then - export LR=1e-5 # learning rate - export LORA_LR=1e-5 # a separate learning rate for the low-rank adapters -else - echo "Model type ${MODEL_TYPE} not recognized! Panza only works with mistralv2 and llama3 models. Exiting." - exit -fi - -echo "Using Learning Rate ${LR} and LoRA LR ${LORA_LR} for ${MODEL_TYPE} model" - - -# hyper-parameters without default values -export SPA_DENSITY=0.01 # the sparse adapters' density -export LORA_R=8 # the low-rank adapters' rank - -export WANDB_PROJECT="panza-${current_user}" -export PRETRAINED=${PANZA_GENERATIVE_MODEL} -export CONFIG=${PANZA_FINETUNE_CONFIGS}/rosa_panza.yaml -export NUM_CPU_THREADS=0 # useful for running of CPU, 0 means default the used by torch - -export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" # if not set, default to 0 - -# take all the input arguments and put them in environment variables -# this could override the hyper-parameters defined above -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - -if [ "$PANZA_FINETUNE_WITH_PREAMBLE" = 1 ]; then - PREAMBLE_STR="PREAMBLE" - PREPROCESSING_FN=panza.finetuning.preprocessing:panza_preprocessing_function_train_with_preamble -else - PREAMBLE_STR="" - PREPROCESSING_FN=panza.finetuningpreprocessing:panza_preprocessing_function -fi - -# some post-processing on the inputs -export MAX_DURATION=${NUM_EPOCHS}ep - -# create directories to save the masks and models -mkdir -p ${BASE_SAVE_PATH}/masks/ -mkdir -p ${BASE_SAVE_PATH}/models/ - -if [ "$MODEL_PRECISION" = "bf16" ]; then - export ROSA_DTYPE=bf16 -elif [ "$MODEL_PRECISION" = "4bit" ]; then - export ROSA_DTYPE=fp32 -elif [ "$MODEL_PRECISION" = "fp32" ]; then - export ROSA_DTYPE=fp32 -else - echo "Unknown model precision $MODEL_PRECISION" - exit 1 -fi - - - -# Running BLEU evaluation -EVAL_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/evaluation.py -python ${EVAL_SCRIPT} \ - --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --golden=${PANZA_DATA_DIR}/test.jsonl \ - --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ - --wandb-run-id=${WANDB_RUN_ID} \ - ${USE_4BIT_QUANT} - -# # Running BLEU evaluation with RAG -# python 
${EVAL_SCRIPT} \ -# --model=${BASE_SAVE_PATH}/models/${RUN_NAME} \ -# --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ -# --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ -# --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ -# --golden=${PANZA_DATA_DIR}/test.jsonl \ -# --batch-size=${PANZA_EVALUATION_BATCH_SIZE} \ -# --wandb-run-id=${WANDB_RUN_ID} \ -# --embedding-model=${PANZA_EMBEDDING_MODEL} \ -# --db-path=${PANZA_DATA_DIR} \ -# --index-name=${PANZA_USERNAME} \ -# --use-rag \ -# ${USE_4BIT_QUANT} - -echo "find the adapter at ${BASE_SAVE_PATH}/models/${RUN_NAME}" diff --git a/scripts/extract_emails.sh b/scripts/extract_emails.sh deleted file mode 100755 index 23f5300..0000000 --- a/scripts/extract_emails.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -source config.sh - -MBOX_NAME="Sent.mbox" -MBOX_PATH="${PANZA_DATA_DIR}/${MBOX_NAME}" - -python ../src/panza/data_preparation/extract_emails.py \ - --mbox-path=${MBOX_PATH} \ - --output-path=${PANZA_DATA_DIR} \ - --email=${PANZA_EMAIL_ADDRESS} \ \ No newline at end of file diff --git a/scripts/finetune.py b/scripts/finetune.py deleted file mode 100644 index 37f73da..0000000 --- a/scripts/finetune.py +++ /dev/null @@ -1,237 +0,0 @@ -import codecs -import logging -import os -import pty -import random -import shutil -import subprocess -import sys -import tempfile -import time -from pathlib import Path - -import hydra -import psutil -import torch -from omegaconf import DictConfig, OmegaConf - -from panza3 import PanzaWriter # The import also loads custom Hydra resolvers - -LOGGER = logging.getLogger(__name__) - - -def create_run_name(cfg: DictConfig) -> str: - # export RUN_NAME=panza_${PANZA_USERNAME}_${MODEL_TYPE}_${MODEL_PRECISION}-bs${BS}-fft-lr${LR}-epochs${NUM_EPOCHS}-wu${WARMUP}-seed${SEED}${PREAMBLE_STR}${RAFT_STR}-$RANDOM - - run_name = f"panza_{cfg.user.username}" - - model_name = cfg.model.split("/")[-1] - run_name += f"-{model_name}" - - run_name += f"-{cfg.model_precision}" - run_name += f"-bs{cfg.batch_size}" - - if hasattr(cfg.finetuning, "rosa"): - run_name += "-rosa" - else: - run_name += "-fft" - - run_name += f"-lr{cfg.lr}" - run_name += f"-epochs{cfg.num_epochs}" - run_name += f"-seed{cfg.seed}" - run_name += f"-{random.randint(1e6, 1e7 - 1)}" - - return run_name - - -def override_rosa_schedule(cfg: DictConfig, mask_generation=False) -> None: - # Disable struct mode to allow modifications - rosa_cfg = cfg.finetuning.rosa - OmegaConf.set_struct(rosa_cfg, False) - - mask_path = str(Path(cfg.checkpoint_dir) / "masks" / cfg.finetuning.run_name) - - if mask_generation: - rosa_cfg.schedule = "wl16" if rosa_cfg.lora_r != 0 else "spa_only" - rosa_cfg.mask_load_path = None - rosa_cfg.mask_save_path = mask_path - rosa_cfg.terminate_after_mask_generation = True - else: - if rosa_cfg.spa_d == 0 and rosa_cfg.lora_r != 0: - rosa_cfg.schedule = "default" - elif rosa_cfg.lora_r != 0: - rosa_cfg.schedule = "lora_only" - rosa_cfg.mask_load_path = None - else: - rosa_cfg.schedule = "spa_only" - - rosa_cfg.mask_load_path = mask_path - rosa_cfg.mask_save_path = None - rosa_cfg.terminate_after_mask_generation = None - - # Re-enable struct mode to lock down the configuration - OmegaConf.set_struct(rosa_cfg, True) - - -def create_experiment_yaml() -> str: - pass - - -def create_checkpoint_dirs(cfg: DictConfig) -> None: - # Create model directory - os.makedirs(os.path.join(cfg.checkpoint_dir, "models"), exist_ok=True) - - # Create mask directory - if hasattr(cfg.finetuning, "rosa"): - os.makedirs(os.path.join(cfg.checkpoint_dir, "masks"), exist_ok=True) - 
- -def get_hf_save_precision(cfg: DictConfig) -> str: - if cfg.model_precision == "bf16": - return "bfloat16" - elif cfg.model_precision == "fp32": - return "float32" - else: - raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") - - -def get_rosa_dtype(cfg: DictConfig) -> str: - if cfg.model_precision == "bf16": - return "bg16" - elif cfg.model_precision == "fp32": - return "fp32" - elif cfg.model_precision == "4bit": - return "fp32" - else: - raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") - - -def override_config(cfg: DictConfig) -> None: - # Disable struct mode to allow modifications - OmegaConf.set_struct(cfg, False) - - cfg.finetuning.run_name = create_run_name(cfg) - - if hasattr(cfg.finetuning, "rosa"): - cfg.finetuning.rosa.rosa_dtype = get_rosa_dtype(cfg) - else: - cfg.finetuning.callbacks.hf_checkpointer.precision = get_hf_save_precision(cfg) - - # Re-enable struct mode to lock down the configuration - OmegaConf.set_struct(cfg, True) - - -def save_config_to_yaml(cfg: DictConfig) -> str: - cfg = OmegaConf.to_container(cfg, resolve=True) - with tempfile.NamedTemporaryFile("w", delete=False, suffix=".yaml") as temp_file: - OmegaConf.save(config=cfg, f=temp_file.name) - return temp_file.name - - -def launch_experiment(cfg: DictConfig, finetuning_yaml: str, prompt_builder_yaml: str) -> None: - def terminate_process_tree(pid: str): - try: - parent = psutil.Process(pid) - children = parent.children(recursive=True) - for child in children: - print("Terminating child process", child) - child.terminate() - psutil.wait_procs(children, timeout=5) - print("Terminating parent process", parent) - parent.terminate() - parent.wait(5) - except psutil.NoSuchProcess: - pass - - train_script = os.path.join(cfg.panza_workspace, "src/panza3/finetuning/train.py") - environment = os.environ.copy() - environment["PYTHONPATH"] = os.path.join(cfg.panza_workspace, "src") - environment["WANDB_PROJECT"] = f"panza-{cfg.user.username}" - environment["WANDB_DISABLED"] = str(int(cfg.wandb_disabled)) - environment["PANZA_PREPROCESSING_CONFIG"] = prompt_builder_yaml - - print(finetuning_yaml) - print(train_script) - print(environment["PYTHONPATH"]) - command = f"composer {train_script} {finetuning_yaml}" - master, slave = pty.openpty() # Open a pseudo-terminal - with subprocess.Popen( - command, - stdout=slave, - stderr=subprocess.STDOUT, - text=True, - env=environment, - preexec_fn=os.setsid, - shell=True, - ) as process: - os.close(slave) # Close the slave descriptor - - # Set up a stream reader for the master end of the pty - try: - with codecs.getreader("utf-8")(os.fdopen(master, "rb")) as reader: - # Read and process output line by line - for line in reader: - print(line, end="") - - return process.returncode - except KeyboardInterrupt: - print("Killing process") - # os.killpg(os.getpgid(process.pid), subprocess.signal.SIGTERM) - terminate_process_tree(process.pid) - torch.cuda.empty_cache() - time.sleep(3) # Give some time for GPU resources to be released - - -def move_checkpoint_files(cfg: DictConfig) -> None: - # Move checkpoint files to the final directory - run_save_path = Path(cfg.hf_save_path) / "models" / cfg.finetuning.run_name - huggingface_dir = run_save_path / "huggingface" - last_save_dir_name = max(huggingface_dir.iterdir(), key=os.path.getmtime).name - - # Move the contents of the last saved directory to the run save path - source_dir = huggingface_dir / last_save_dir_name - for item in source_dir.iterdir(): - shutil.move(str(item), run_save_path) - - # 
Remove the now-empty huggingface directory - shutil.rmtree(huggingface_dir) - - -@hydra.main(version_base="1.1", config_path="../configs", config_name="panza_finetuning") -def main(cfg: DictConfig) -> None: - LOGGER.info("Starting Panza Finetuning") - LOGGER.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) - - # Override configuration - override_config(cfg) - - create_checkpoint_dirs(cfg) - - # Launch training - preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) - - if "rosa" in cfg.finetuning: - # Generate masks - if cfg.finetuning.rosa.spa_d != 0: - override_rosa_schedule(cfg, mask_generation=True) - finetuning_yaml = save_config_to_yaml(cfg.finetuning) - # pdb.set_trace() - launch_experiment(cfg, finetuning_yaml, preprocessing_yaml) - # RoSA finetuning - override_rosa_schedule(cfg, mask_generation=False) - finetuning_yaml = save_config_to_yaml(cfg.finetuning) - # pdb.set_trace() - launch_experiment(cfg, finetuning_yaml, preprocessing_yaml) - else: - finetuning_yaml = save_config_to_yaml(cfg.finetuning) - launch_experiment(cfg, finetuning_yaml, preprocessing_yaml) - move_checkpoint_files(cfg) - - print( - "Find the finetuned model at", - os.path.join(cfg.hf_save_path, "models", cfg.finetuning.run_name), - ) - - -if __name__ == "__main__": - main() diff --git a/scripts/finetune_simple.py b/scripts/finetune_simple.py deleted file mode 100644 index 35c68eb..0000000 --- a/scripts/finetune_simple.py +++ /dev/null @@ -1,1016 +0,0 @@ -# Copyright 2022 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 -import copy -import gc -import logging -import os -import random -import sys -import tempfile -import time -import warnings -from pathlib import Path -from typing import Any, Dict, List, Optional, Union - -import torch -from composer import Trainer -from composer.core.callback import Callback -from composer.metrics.nlp import (InContextLearningCodeEvalAccuracy, InContextLearningLMAccuracy, - InContextLearningLMExpectedCalibrationError, - InContextLearningMCExpectedCalibrationError, - InContextLearningMultipleChoiceAccuracy, - InContextLearningQAAccuracy, LanguageCrossEntropy, - LanguagePerplexity) -from composer.optim import DecoupledAdamW -from composer.profiler import JSONTraceHandler, Profiler, TraceHandler, cyclic_schedule -from composer.utils import dist, get_device, reproducibility -from llmfoundry import ComposerHFCausalLM -from llmfoundry.eval.metrics.nlp import InContextLearningMetric -from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP -from llmfoundry.models.utils import init_empty_weights -from llmfoundry.utils import find_mosaicml_logger, log_train_analytics, maybe_create_mosaicml_logger -from omegaconf import DictConfig, ListConfig -from omegaconf import OmegaConf as om -from peft import get_peft_model -from peft.tuners.rosa import RosaConfig, RosaModel, RosaScheduler -from rich.traceback import install -from torch.distributed.fsdp import FullStateDictConfig -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import StateDictType -from transformers import AutoModelForCausalLM, BitsAndBytesConfig, PreTrainedTokenizerBase - -install() -from llmfoundry.callbacks import AsyncEval -from llmfoundry.data.dataloader import build_dataloader -from llmfoundry.layers_registry import ffns_with_megablocks -from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, build_algorithm, build_callback, - build_composer_model, build_evaluators, build_logger, - build_optimizer, 
build_scheduler, build_tokenizer) -from llmfoundry.utils.config_utils import (log_config, pop_config, process_init_device, - update_batch_size_info) -from llmfoundry.utils.registry_utils import import_file - -import hydra -from omegaconf import DictConfig, OmegaConf - -from panza3 import PanzaWriter # The import also loads custom Hydra resolvers - -log = logging.getLogger(__name__) - - -def validate_config(cfg: DictConfig): - """Validates compatible model and dataloader selection.""" - loaders = [cfg.train_loader] - if 'eval_loader' in cfg: - eval_loader = cfg.eval_loader - if isinstance(eval_loader, ListConfig): - for loader in eval_loader: - if loader.label is None: - raise ValueError( - 'When specifying multiple evaluation datasets, each one must include the \ - `label` attribute.') - loaders.append(loader) - else: - loaders.append(eval_loader) - for loader in loaders: - if loader.name == 'text': - if cfg.model.name == 'hf_t5': - raise ValueError( - f'Model type "{cfg.model.name}" is not supported when using the "text " ' +\ - f'dataloader. Only finetuning is supported.') - - if 'icl_tasks' in cfg: - if cfg.model.name == 'hf_t5': - raise ValueError( - 'ICL evaluation does not currently support Encoder-Decoder models, such as "hf_t5".' - ) - - if (cfg.model.get('fc_type', 'torch') != 'te' and 'te' not in cfg.model.get( - 'ffn_config', {}).get('ffn_type', 'mptmlp') and - 'fp8' in cfg.precision): - warnings.warn( - "fp8 only supported for te.Linear layers. Either set `cfg.model.fc_typ='te'` or " - + - "`cfg.model.ffn_config.ffn_type='te_ln_mlp'` to enable layers using fp8 precision." - ) - - if (cfg.model.get('fc_type', 'torch') == 'te' or - 'te' in cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp')): - fsdp_config = cfg.get('fsdp_config', None) - act_ckpt = fsdp_config.get('activation_checkpointing', False) - act_ckpt_reentrant = fsdp_config.get( - 'activation_checkpointing_reentrant', False) - if fsdp_config is not None and act_ckpt == True and act_ckpt_reentrant == True: - warnings.warn( - '`te.Linear` layers do not support activation_checkpointing with ' - + '`activation_checkpointing_reentrant = True`. ' + - 'Setting cfg.fsdp_config.activation_checkpointing_reentrant=False.' - ) - cfg.fsdp_config.activation_checkpointing_reentrant = False - - if cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp') == 'te_ln_mlp': - warnings.warn( - '`te.LayerNormMLP` requires has issues with torch._dynamo. ' + - 'Setting `torch._dynamo.config.suppress_errors = True` and falling back to eager.' - ) - torch._dynamo.config.suppress_errors = True # type: ignore (third-party) - - if cfg.model.get('load_in_8bit', False): - raise ValueError( - '`load_in_8bit` is only supported for evaluation rather than training.' - ) - - if cfg.model.get('ffn_config', {}).get('ffn_type', - 'mptmlp') in ffns_with_megablocks: - moe_world_size = cfg.model.get('ffn_config', - {}).get('moe_world_size', 1) - use_orig_params = cfg.get('fsdp_config', - {}).get('use_orig_params', True) - if moe_world_size > 1 and not use_orig_params: - raise ValueError( - f'MoEs with expert parallelism (moe_world_size {moe_world_size} > 1) require `use_orig_params=True`.' 
- ) - - -def create_run_name(cfg: DictConfig) -> str: - # export RUN_NAME=panza_${PANZA_USERNAME}_${MODEL_TYPE}_${MODEL_PRECISION}-bs${BS}-fft-lr${LR}-epochs${NUM_EPOCHS}-wu${WARMUP}-seed${SEED}${PREAMBLE_STR}${RAFT_STR}-$RANDOM - - run_name = f"panza_{cfg.user.username}" - - model_name = cfg.model.split("/")[-1] - run_name += f"-{model_name}" - - run_name += f"-{cfg.model_precision}" - run_name += f"-bs{cfg.batch_size}" - - if hasattr(cfg.finetuning, "rosa"): - run_name += "-rosa" - else: - run_name += "-fft" - - run_name += f"-lr{cfg.lr}" - run_name += f"-epochs{cfg.num_epochs}" - run_name += f"-seed{cfg.seed}" - run_name += f"-{random.randint(1e6, 1e7 - 1)}" - - return run_name - - -def override_rosa_schedule(cfg: DictConfig, mask_generation=False) -> None: - # Disable struct mode to allow modifications - rosa_cfg = cfg.finetuning.rosa - OmegaConf.set_struct(rosa_cfg, False) - - mask_path = str(Path(cfg.checkpoint_dir) / "masks" / cfg.finetuning.run_name) - - if mask_generation: - rosa_cfg.schedule = "wl16" if rosa_cfg.lora_r != 0 else "spa_only" - rosa_cfg.mask_load_path = None - rosa_cfg.mask_save_path = mask_path - rosa_cfg.terminate_after_mask_generation = True - else: - if rosa_cfg.spa_d == 0 and rosa_cfg.lora_r != 0: - rosa_cfg.schedule = "default" - elif rosa_cfg.lora_r != 0: - rosa_cfg.schedule = "lora_only" - rosa_cfg.mask_load_path = None - else: - rosa_cfg.schedule = "spa_only" - - rosa_cfg.mask_load_path = mask_path - rosa_cfg.mask_save_path = None - rosa_cfg.terminate_after_mask_generation = None - - # Re-enable struct mode to lock down the configuration - OmegaConf.set_struct(rosa_cfg, True) - - -def create_experiment_yaml() -> str: - pass - - -def create_checkpoint_dirs(cfg: DictConfig) -> None: - # Create model directory - os.makedirs(os.path.join(cfg.checkpoint_dir, "models"), exist_ok=True) - - # Create mask directory - if hasattr(cfg.finetuning, "rosa"): - os.makedirs(os.path.join(cfg.checkpoint_dir, "masks"), exist_ok=True) - - -def get_hf_save_precision(cfg: DictConfig) -> str: - if cfg.model_precision == "bf16": - return "bfloat16" - elif cfg.model_precision == "fp32": - return "float32" - else: - raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") - - -def get_rosa_dtype(cfg: DictConfig) -> str: - if cfg.model_precision == "bf16": - return "bg16" - elif cfg.model_precision == "fp32": - return "fp32" - elif cfg.model_precision == "4bit": - return "fp32" - else: - raise ValueError(f"Unsupported model_precision: {cfg.model_precision}") - - -def override_config(cfg: DictConfig) -> None: - # Disable struct mode to allow modifications - OmegaConf.set_struct(cfg, False) - - cfg.finetuning.run_name = create_run_name(cfg) - - if hasattr(cfg.finetuning, "rosa"): - cfg.finetuning.rosa.rosa_dtype = get_rosa_dtype(cfg) - else: - cfg.finetuning.callbacks.hf_checkpointer.precision = get_hf_save_precision(cfg) - - # Re-enable struct mode to lock down the configuration - OmegaConf.set_struct(cfg, True) - - -def save_config_to_yaml(cfg: DictConfig) -> str: - cfg = OmegaConf.to_container(cfg, resolve=True) - with tempfile.NamedTemporaryFile("w", delete=False, suffix=".yaml") as temp_file: - OmegaConf.save(config=cfg, f=temp_file.name) - return temp_file.name - - -def build_composer_peft_model( - model_config: str, rosa_config: Dict[str, Any], - tokenizer: PreTrainedTokenizerBase, is_fsdp: bool = False) -> ComposerHFCausalLM: - - # 1) loads a hf model, 2) adds peft modules, 3) wraps it in a ComposerHFCausalLM. 
- print('Building model from HuggingFace checkpoint...') - - weight_bias_dtype = model_config.get('weight_bias_dtype', None) - if weight_bias_dtype == '4bit': - compute_dtype = torch.bfloat16 - quant_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=compute_dtype, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type='nf4', - ) - elif weight_bias_dtype == 'bf16': - assert weight_bias_dtype == 'bf16', 'Only bf16 is supported for now' - compute_dtype = torch.bfloat16 - quant_config = None - else: - assert weight_bias_dtype == 'fp32' - compute_dtype = torch.float32 - quant_config = None - - with init_empty_weights(include_buffers=False): - model = AutoModelForCausalLM.from_pretrained( - model_config.pretrained_model_name_or_path, - device_map='cpu' if quant_config is None else 'auto', - torch_dtype=compute_dtype, - # load_in_4bit=weight_bias_dtype == '4bit', - quantization_config=quant_config, - trust_remote_code=True, - use_auth_token=True, - use_cache=False, - attn_implementation='eager' - ) - - print('Model built!') - if rosa_config is not None: - print('Building RoSA config...') - config = RosaConfig( - r=rosa_config['lora_r'], - d=rosa_config['spa_d'], - lora_alpha=rosa_config.get('lora_alpha', 16), - target_modules=rosa_config.get('target_modules', 'all-linear'), - lora_dropout=rosa_config.get('lora_dropout', 0.05), - impl=rosa_config.get('impl', 'auto'), - spa_store_transpose=rosa_config.get('spa_store_transpose', True), - rosa_dtype=rosa_config.get('rosa_dtype', True), - spa_num_grads=rosa_config.get('spa_num_grads', 1), - grad_acc_mode=rosa_config.get('grad_acc_mode', 'mean_squared'), - grad_4bit_accum=rosa_config.get('grad_4bit_accum', False), - mask_load_path=rosa_config.get('mask_load_path', None), - mask_save_path=rosa_config.get('mask_save_path', None), - terminate_after_mask_generation=rosa_config.get('terminate_after_mask_generation', False), - schedule=rosa_config.get('schedule', 'df'), - bias="none", - task_type="CAUSAL_LM", - ) - print('Adding RoSA modules...') - model = get_peft_model(model, config) - print('RoSA modules added!') - - train_metrics = [LanguageCrossEntropy(), LanguagePerplexity()] - eval_metrics = [ - LanguageCrossEntropy(), - LanguagePerplexity(), - InContextLearningLMAccuracy(), - InContextLearningMultipleChoiceAccuracy(), - InContextLearningQAAccuracy(), - InContextLearningCodeEvalAccuracy(), - InContextLearningLMExpectedCalibrationError(), - InContextLearningMCExpectedCalibrationError() - ] - - model = HuggingFaceModelWithFSDP( - model=model, - shift_labels=True, - tokenizer=tokenizer, - metrics=train_metrics, - eval_metrics=eval_metrics, - init_device='cpu', - peft_config=None - ) - - # model = ComposerHFCausalLM(model, tokenizer) - # model = ModelComposerHFCausalLM(model, tokenizer) - return model - -@hydra.main(version_base="1.1", config_path="../configs", config_name="panza_finetuning") -def main(cfg: DictConfig) -> Trainer: - override_config(cfg) - - preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) - - create_checkpoint_dirs(cfg) - environment = os.environ - #environment["PYTHONPATH"] = os.path.join(cfg.panza_workspace, "src") - environment["WANDB_PROJECT"] = f"panza-{cfg.user.username}" - environment["WANDB_DISABLED"] = str(int(cfg.wandb_disabled)) - environment["PANZA_PREPROCESSING_CONFIG"] = preprocessing_yaml - environment["TOKENIZERS_PARALLELISM"] = "False" - - cfg = cfg.finetuning - - print("config is") - print(cfg) - OmegaConf.set_struct(cfg, False) - - - # Run user provided code if specified - code_paths 
= pop_config(cfg, - 'code_paths', - must_exist=False, - default_value=[], - convert=True) - # Import any user provided code - for code_path in code_paths: - import_file(code_path) - - # Filter deprecation warning from torch internal usage - warnings.filterwarnings( - action='ignore', - category=UserWarning, - message= - 'torch.distributed.*_base is a private function and will be deprecated.*' - ) - - # Check for incompatibilities between the model and data loaders - validate_config(cfg) - - # Resolve all interpolation variables as early as possible - om.resolve(cfg) - - # Create copy of config for logging - logged_cfg: DictConfig = copy.deepcopy(cfg) - - cuda_alloc_conf = [] - # Get max split size mb - max_split_size_mb: Optional[int] = cfg.pop('max_split_size_mb', None) - if max_split_size_mb is not None: - cuda_alloc_conf.append(f'max_split_size_mb:{max_split_size_mb}') - - # Expandable segments - if cfg.pop('expandable_segments', False): - cuda_alloc_conf.append('expandable_segments:True') - - if len(cuda_alloc_conf) > 0: - os.environ['PYTORCH_CUDA_ALLOC_CONF'] = ','.join(cuda_alloc_conf) - - # Set CUDA lazy loading - # This can save a bit of memory if not all modules are needed - cuda_load_lazy: bool = cfg.pop('cuda_load_lazy', False) - if cuda_load_lazy: - os.environ['CUDA_MODULE_LOADING'] = 'LAZY' - - # Set seed first - seed: int = pop_config(cfg, 'seed', must_exist=True) - reproducibility.seed_all(seed) - - # Initialize pytorch distributed training process groups - dist_timeout: Union[int, float] = pop_config(cfg, - 'dist_timeout', - must_exist=False, - default_value=600.0) - dist.initialize_dist(get_device(None), timeout=dist_timeout) - - # Get global and device batch size information from distributed/single node setting - cfg = update_batch_size_info(cfg) - logged_cfg.update(cfg, merge=True) - - # Mandatory model training configs - model_config: DictConfig = pop_config(cfg, 'model', must_exist=True) - tokenizer_config: Dict[str, Any] = pop_config(cfg, - 'tokenizer', - must_exist=True, - convert=True) - optimizer_config: Dict[str, Any] = pop_config(cfg, - 'optimizer', - must_exist=True, - convert=True) - scheduler_config: Dict[str, Any] = pop_config(cfg, - 'scheduler', - must_exist=True, - convert=True) - train_loader_config: DictConfig = pop_config(cfg, - 'train_loader', - must_exist=True) - - # Optional fsdp data, fine-tuning, and eval configs - fsdp_config: Optional[Dict[str, Any]] = pop_config(cfg, - 'fsdp_config', - must_exist=False, - default_value=None, - convert=True) - - ds_config: Optional[Dict[str, Any]] = pop_config(cfg, - 'ds_config', - must_exist=False, - default_value=None, - convert=True) - - rosa_config: Optional[Dict[str, Any]] = pop_config(cfg, - 'rosa', - must_exist=False, - default_value=None, - convert=True) - - hf_save_path: Union[int, str] = pop_config(cfg, - 'hf_save_path', - must_exist=True) - - eval_loader_config: Optional[Union[DictConfig, ListConfig]] = pop_config( - cfg, 'eval_loader', must_exist=False, default_value=None) - icl_tasks_config: Optional[Union[ListConfig, - str]] = pop_config(cfg, - 'icl_tasks', - must_exist=False, - default_value=None) - eval_gauntlet_config: Optional[Union[DictConfig, - str]] = pop_config(cfg, - 'eval_gauntlet', - must_exist=False, - default_value=None) - icl_subset_num_batches: Optional[int] = pop_config(cfg, - 'icl_subset_num_batches', - must_exist=False, - default_value=None) - icl_seq_len: Optional[int] = pop_config(cfg, - 'icl_seq_len', - must_exist=False, - default_value=None) - # Optional logging, evaluation and 
callback configs - logger_configs: Optional[DictConfig] = pop_config(cfg, - 'loggers', - must_exist=False, - default_value=None, - convert=True) - callback_configs: Optional[DictConfig] = pop_config(cfg, - 'callbacks', - must_exist=False, - default_value=None, - convert=True) - algorithm_configs: Optional[DictConfig] = pop_config(cfg, - 'algorithms', - must_exist=False, - default_value=None) - - # Mandatory hyperparameters for training - device_train_batch_size: int = pop_config(cfg, - 'device_train_batch_size', - must_exist=True) - device_eval_batch_size: int = pop_config(cfg, - 'device_eval_batch_size', - must_exist=True) - max_duration: Union[int, str] = pop_config(cfg, - 'max_duration', - must_exist=True) - eval_interval: Union[int, str] = pop_config(cfg, - 'eval_interval', - default_value=1, - must_exist=False) - precision: str = pop_config(cfg, 'precision', must_exist=True) - max_seq_len: int = pop_config(cfg, 'max_seq_len', must_exist=True) - - # Optional parameters will be set to default values if not specified. - default_run_name: str = os.environ.get('RUN_NAME', 'llm') - run_name: str = pop_config(cfg, - 'run_name', - must_exist=False, - default_value=default_run_name) - save_folder: Optional[str] = pop_config(cfg, - 'save_folder', - must_exist=False, - default_value=None) - is_state_dict_sharded: bool = (fsdp_config.get('state_dict_type', 'full') - == 'sharded') if fsdp_config else False - save_latest_filename: str = pop_config( - cfg, - 'save_latest_filename', - must_exist=False, - default_value='latest-sharded-rank{rank}' - if is_state_dict_sharded else 'latest-rank{rank}.pt') - save_overwrite: bool = pop_config(cfg, - 'save_overwrite', - must_exist=False, - default_value=False) - save_weights_only: bool = pop_config(cfg, - 'save_weights_only', - must_exist=False, - default_value=False) - save_filename: str = pop_config( - cfg, - 'save_filename', - must_exist=False, - default_value='ep{epoch}-ba{batch}-rank{rank}.pt') - save_interval: Union[str, int] = pop_config(cfg, - 'save_interval', - must_exist=False, - default_value='1000ba') - save_num_checkpoints_to_keep: int = pop_config( - cfg, 'save_num_checkpoints_to_keep', must_exist=False, default_value=-1) - progress_bar = pop_config(cfg, - 'progress_bar', - must_exist=False, - default_value=False) - log_to_console: bool = pop_config(cfg, - 'log_to_console', - must_exist=False, - default_value=True) - python_log_level: Optional[str] = pop_config(cfg, - 'python_log_level', - must_exist=False, - default_value='debug') - console_log_interval: Union[int, str] = pop_config(cfg, - 'console_log_interval', - must_exist=False, - default_value='1ba') - device_train_microbatch_size: Union[str, int] = pop_config( - cfg, - 'device_train_microbatch_size', - must_exist=False, - default_value='auto') - eval_subset_num_batches: int = pop_config(cfg, - 'eval_subset_num_batches', - must_exist=False, - default_value=-1) - eval_first: bool = pop_config(cfg, - 'eval_first', - must_exist=False, - default_value=False) - load_path: str = pop_config(cfg, - 'load_path', - must_exist=False, - default_value=None) - load_weights_only: bool = pop_config(cfg, - 'load_weights_only', - must_exist=False, - default_value=False) - load_strict_model_weights: bool = pop_config(cfg, - 'load_strict_model_weights', - must_exist=False, - default_value=True) - load_ignore_keys: Optional[List[str]] = pop_config(cfg, - 'load_ignore_keys', - must_exist=False, - default_value=None) - save_ignore_keys: Optional[List[str]] = pop_config(cfg, - 'save_ignore_keys', - 
must_exist=False, - default_value=None) - compile_config: Optional[Dict[str, Any]] = pop_config(cfg, - 'compile_config', - must_exist=False, - default_value=None) - metadata: Optional[Dict[str, str]] = pop_config(cfg, - 'metadata', - must_exist=False, - default_value=None, - convert=True) - should_log_config: bool = pop_config(cfg, - 'log_config', - must_exist=False, - default_value=True) - - num_cpu_threads: Optional[int] = cfg.pop('num_cpu_threads', 0) - if num_cpu_threads > 0: - print(f'Setting number of CPU threads to {num_cpu_threads}') - import spops - torch.set_num_threads(num_cpu_threads) - spops.set_num_threads(num_cpu_threads) - - # Enable autoresume from model checkpoints if possible - autoresume_default: bool = False - if logged_cfg.get('run_name', None) is not None \ - and save_folder is not None \ - and not save_overwrite \ - and not save_weights_only: - autoresume_default = True - - if cfg.get('autoresume') is None and autoresume_default: - log.info('As run_name, save_folder, and save_latest_filename are set, \ - changing autoresume default to True...') - - autoresume: bool = pop_config(cfg, - 'autoresume', - must_exist=False, - default_value=autoresume_default) - - # Pop known unused parameters that are used as interpolation variables or - # created by update_batch_size_info. - pop_config(cfg, 'data_local', must_exist=False) - pop_config(cfg, 'data_remote', must_exist=False) - pop_config(cfg, 'global_seed', must_exist=False) - pop_config(cfg, 'global_train_batch_size', must_exist=False) - pop_config(cfg, 'n_gpus', must_exist=False) - pop_config(cfg, 'device_train_grad_accum', must_exist=False) - - assert fsdp_config is None or ds_config is None, 'fsdp and deepspeed are not supported together' - - # Warn users for unused parameters - for key in cfg: - warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary.' - ) - - # Warn if fsdp is enabled but user only has 1 GPU - if dist.get_world_size() == 1 and fsdp_config is not None: - warnings.warn( - 'FSDP is not applicable for single-GPU training. 
Reverting to DDP.') - fsdp_config = None - - # set logging level - if python_log_level is not None: - logging.basicConfig( - # Example of format string - # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here - format= - f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' - ) - logging.getLogger('llmfoundry').setLevel( - python_log_level.upper()) # Foundry module - logging.getLogger(__name__).setLevel( - python_log_level.upper()) # Train script - - # Initialize context - init_context = process_init_device(model_config, fsdp_config) - logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) - - # Build tokenizer - log.info('Building tokenizer...') - tokenizer_name = tokenizer_config['name'] - tokenizer_kwargs = tokenizer_config.get('kwargs', {}) - tokenizer_kwargs["num_proc"] = 1 - tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - - # Scheduler - scheduler_name: str = scheduler_config.pop('name') - scheduler = build_scheduler(scheduler_name, scheduler_config) - - # Loggers - loggers = [ - build_logger(str(name), logger_cfg) - for name, logger_cfg in logger_configs.items() - ] if logger_configs else [] - - mosaicml_logger = find_mosaicml_logger(loggers) - if mosaicml_logger is None: - mosaicml_logger = maybe_create_mosaicml_logger() - if mosaicml_logger is not None: - # mosaicml_logger will be None if run isn't on MosaicML platform - loggers.append(mosaicml_logger) - - if metadata is not None: - # Flatten the metadata for logging - logged_cfg.pop('metadata', None) - logged_cfg.update(metadata, merge=True) - if mosaicml_logger is not None: - mosaicml_logger.log_metrics(metadata) - mosaicml_logger._flush_metadata(force_flush=True) - - # Profiling - profiler: Optional[Profiler] = None - profiler_cfg: Optional[DictConfig] = pop_config(cfg, - 'profiler', - must_exist=False, - convert=False, - default_value=None) - if profiler_cfg: - profiler_schedule_cfg: Dict = pop_config(profiler_cfg, - 'schedule', - must_exist=True, - convert=True) - profiler_schedule = cyclic_schedule(**profiler_schedule_cfg) - # Only support json trace handler - profiler_trace_handlers: List[TraceHandler] = [] - profiler_trace_cfg: Optional[Dict] = pop_config(profiler_cfg, - 'json_trace_handler', - must_exist=False, - default_value=None, - convert=True) - if profiler_trace_cfg: - profiler_trace_handlers.append( - JSONTraceHandler(**profiler_trace_cfg)) - profiler = Profiler(**profiler_cfg, - trace_handlers=profiler_trace_handlers, - schedule=profiler_schedule) - - # Callbacks - callbacks: List[Callback] = [ - build_callback(str(name), callback_cfg, om.to_container(logged_cfg)) - for name, callback_cfg in callback_configs.items() - ] if callback_configs else [] - - use_async_eval = any(isinstance(c, AsyncEval) for c in callbacks) - - print('ROSA CONFIG', rosa_config) - # Build Model - print('Initializing model...') - with init_context: - assert fsdp_config is None or rosa_config is None, 'fsdp is cuurently not supported with RoSA' - model = build_composer_peft_model(model_config, rosa_config, tokenizer, is_fsdp=fsdp_config is not None) - if rosa_config is not None: - assert isinstance(model.model.base_model, RosaModel) - - # Algorithms - algorithms = [ - build_algorithm(str(name), algorithm_cfg) - for name, algorithm_cfg in algorithm_configs.items() - ] if algorithm_configs else [] - - if rosa_config is not None: - algorithms.append(RosaScheduler(model.model.base_model)) - - # Dataloaders - log.info('Building train loader...') - 
from datasets import disable_caching - from streaming.base.util import clean_stale_shared_memory - clean_stale_shared_memory() - try: - disable_caching() - - train_loader = build_dataloader( - train_loader_config, - tokenizer, - device_train_batch_size, - ) - except Exception as e: - if mosaicml_logger is not None: - mosaicml_logger.log_exception(e) - raise e - - if mosaicml_logger is not None: - mosaicml_logger.log_metrics({'data_validated': time.time()}) - - ## Evaluation - if use_async_eval: - evaluators = [] - if eval_first: - warnings.warn( - 'AsyncEval callback does not support eval_first=True. Ignoring.' - ) - eval_first = False - - else: - log.info('Building eval loader...') - eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len - evaluators, _, eval_gauntlet_callback = build_evaluators( - eval_loader_config, - icl_tasks_config, - eval_gauntlet_config, - tokenizer=tokenizer, - device_eval_batch_size=device_eval_batch_size, - icl_seq_len=eval_icl_seq_len, - icl_subset_num_batches=icl_subset_num_batches, - ) - if eval_gauntlet_callback is not None: - callbacks.append(eval_gauntlet_callback) - - if mosaicml_logger is not None: - log_train_analytics(mosaicml_logger, model_config, train_loader_config, - eval_loader_config, callback_configs, - tokenizer_name, load_path, icl_tasks_config, - eval_gauntlet_config) - # # Build Model - # log.info('Initializing model...') - # model = build_composer_model( - # name=model_config.name, - # cfg=model_config, - # tokenizer=tokenizer, - # init_context=init_context, - # master_weights_dtype=model_config.get('master_weights_dtype', None), - # ) - - # Log number of parameters - if hasattr(model, 'n_total_params'): - n_params = model.n_total_params - n_trainable_params = n_params # TODO: we currently assume all parameters are trainable. 
- else: - n_params = sum(p.numel() for p in model.parameters()) - n_trainable_params = sum( - p.numel() for p in model.parameters() if p.requires_grad) - if hasattr(model, 'n_active_params'): - n_active_params = model.n_active_params - else: - n_active_params = n_params - logged_cfg.update({ - 'n_params': n_params, - 'n_active_params': n_active_params, - 'n_trainable_params': n_trainable_params, - }) - - # Optimizer - optimizer_name: str = optimizer_config.pop('name') - if rosa_config is None or 'lora_lr' not in rosa_config: - optimizer = build_optimizer(model, optimizer_name, optimizer_config) - else: - print(f'Using a different learning rate for lora params {rosa_config["lora_lr"]}') - assert optimizer_name == 'decoupled_adamw' - lora_params = [] - other_params = [] - for name, param in model.named_parameters(): - if any([k in name for k in ['rosa_A', 'rosa_B', 'rosa_embedding_A', 'rosa_embedding_B']]): - lora_params.append(param) - else: - other_params.append(param) - - print(f'Found {len(lora_params)} lora params and {len(other_params)} other params') - params = [ - {'params': other_params}, - {'params': lora_params, 'lr': rosa_config['lora_lr']} - ] - optimizer = DecoupledAdamW(params, **optimizer_config) - - - - # Now add the eval metrics - try: - if eval_loader_config is not None and not use_async_eval: - eval_metrics = model.get_metrics(is_train=False) - non_icl_metrics = [ - metric_name for metric_name, metric in eval_metrics.items() - if not isinstance(metric, InContextLearningMetric) - ] - evaluators = add_metrics_to_eval_loaders(evaluators, - non_icl_metrics) - except Exception as e: - if mosaicml_logger is not None: - mosaicml_logger.log_exception(e) - raise e - - # Build the Trainer - log.info('Building trainer...') - trainer = Trainer( - run_name=run_name, - seed=seed, - model=model, - train_dataloader=train_loader, - eval_dataloader=evaluators, - optimizers=optimizer, - schedulers=scheduler, - max_duration=max_duration, - eval_interval=eval_interval, - eval_subset_num_batches=eval_subset_num_batches, - progress_bar=progress_bar, - log_to_console=log_to_console, - console_log_interval=console_log_interval, - loggers=loggers, - callbacks=callbacks, - precision=precision, - algorithms=algorithms, - device_train_microbatch_size=device_train_microbatch_size, - fsdp_config=fsdp_config, - deepspeed_config=ds_config, - save_folder=save_folder, - save_filename=save_filename, - save_latest_filename=save_latest_filename, - save_interval=save_interval, - save_num_checkpoints_to_keep=save_num_checkpoints_to_keep, - save_overwrite=save_overwrite, - save_weights_only=save_weights_only, - load_path=load_path, - load_weights_only=load_weights_only, - load_strict_model_weights=load_strict_model_weights, - load_ignore_keys=load_ignore_keys, - save_ignore_keys=save_ignore_keys, - autoresume=autoresume, - python_log_level=python_log_level, - dist_timeout=dist_timeout, - profiler=profiler, - compile_config=compile_config, - ) - - if should_log_config: - log.info('Logging config') - log_config(logged_cfg) - torch.cuda.empty_cache() - gc.collect() - - # Eval first if requested - if eval_first and trainer.state.timestamp.batch.value == 0: - trainer.eval() - - log.info('Starting training...') - trainer.fit() - - # if rosa is enabled, save the model manually, since - # llm-foundry's checkpointing doesn't work properly with RoSA - if rosa_config is not None: - assert fsdp_config is None, 'fsdp is cuurently not supported with RoSA' - path_to_save = os.path.join(hf_save_path, run_name) - print(f'saving 
the model to {path_to_save}') - if torch.distributed.get_rank() == 0: - model.model.save_pretrained(path_to_save, is_main_process=True, state_dict=model.model.state_dict()) - tokenizer.save_pretrained(path_to_save) - - # print('Saving directly into HF-friendly format') - - # path_to_save = os.path.join(hf_save_path, run_name) - # print('saving the model.') - # if fsdp_config is None: - # model.model.save_pretrained(path_to_save, is_main_process=torch.distributed.get_rank() == 0, state_dict=model.model.state_dict()) - # else: - # with FSDP.summon_full_params(model.model, writeback=False, rank0_only=True, offload_to_cpu=True): - # model_to_save = model.model - # model_to_save.save_pretrained(path_to_save, state_dict=model_to_save.state_dict()) - - # if torch.distributed.get_rank() == 0: - # tokenizer.save_pretrained(path_to_save) - - # # NOTE: for some reason the saving code above would create empty pytorch_model.bin file, so we delete it manually - # # TODO: figure out why this happens - # if torch.distributed.get_rank() == 0 and os.path.exists(os.path.join(path_to_save, "pytorch_model.bin")): - # tmp = torch.load(os.path.join(path_to_save, "pytorch_model.bin")) - # if not tmp: # empty dict, remove it - # os.remove(os.path.join(path_to_save, "pytorch_model.bin")) - - log.info('Done.') - return trainer - - -PY = None -FY = None - -def do_thing(cfg:DictConfig) -> List[str]: - # Override configuration - override_config(cfg) - - create_checkpoint_dirs(cfg) - - # Launch training - print("HEEEeEre") - preprocessing_yaml = save_config_to_yaml(cfg.preprocessing) - finetuning_yaml = save_config_to_yaml(cfg.finetuning) - print(preprocessing_yaml, finetuning_yaml) - PY = preprocessing_yaml - FY = finetuning_yaml - #return "hellooooo", "lol" - #return [preprocessing_yaml, finetuning_yaml] - - -if __name__ == '__main__': - # yaml_path, args_list = sys.argv[1], sys.argv[2:] - - # # Disable resolving environment variables through omegaconf. - # om.clear_resolver('oc.env') - - # #if get_local_rank == 0: - # log.info("Starting Panza Finetuning") - # print("hello") - # with open(yaml_path) as f: - # print("loading") - # cfg = om.load(f) - #raise ValueError(cfg) - #if os.getenv("LOCAL_RANK", '0') == "0": - # do_thing() - # raise ValueError(PY) - # print(preprocessing_yaml, finetuning_yaml) - # environment = os.environ.copy() - # environment["PYTHONPATH"] = os.path.join(cfg.panza_workspace, "src") - # environment["WANDB_PROJECT"] = f"panza-{cfg.user.username}" - # environment["WANDB_DISABLED"] = str(int(cfg.wandb_disabled)) - # environment["PANZA_PREPROCESSING_CONFIG"] = preprocessing_yaml - # sys.exit() - # #log.info("Configuration: \n%s", OmegaConf.to_yaml(cfg, resolve=True)) - # # Load yaml and cli arguments. 
- # with open(yaml_path) as f: - # yaml_cfg = om.load(f) - # cli_cfg = om.from_cli(args_list) - # cfg = om.merge(yaml_cfg, cli_cfg) - # om.resolve(cfg) - # assert isinstance(cfg, DictConfig) - main() diff --git a/scripts/output.tx b/scripts/output.tx deleted file mode 100644 index e69de29..0000000 diff --git a/scripts/prepare_data.py b/scripts/prepare_data.py index 62bd033..d279ea5 100644 --- a/scripts/prepare_data.py +++ b/scripts/prepare_data.py @@ -11,11 +11,11 @@ from omegaconf import DictConfig, OmegaConf from tqdm import tqdm -from panza3 import PanzaWriter # The import also loads custom Hydra resolvers -from panza3.entities import Document, Email, SummarizationInstruction -from panza3.retriever import DocumentRetriever -from panza3.data_preparation.extract_emails import extract_emails -from panza3.data_preparation.rag import create_vector_store +from panza import PanzaWriter # The import also loads custom Hydra resolvers +from panza.entities import Document, Email, SummarizationInstruction +from panza.retriever import DocumentRetriever +from panza.data_preparation.extract_emails import extract_emails +from panza.data_preparation.rag import create_vector_store LOGGER = logging.getLogger(__name__) diff --git a/scripts/prepare_dataset.sh b/scripts/prepare_dataset.sh deleted file mode 100755 index 3e0799f..0000000 --- a/scripts/prepare_dataset.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -source config.sh - -TRAIN_RATIO=1.0 -SPLIT_TYPE="chronological" # random or chronological - -CHUNK_SIZE=3000 -CHUNK_OVERLAP=3000 - -LOAD_IN_4BIT=0 -RUN_FP32=0 - -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - - - -USE_4BIT_QUANT=$([ "${LOAD_IN_4BIT}" = 1 ] && echo "--load-in-4bit" || echo "") -USE_FP32_COMPUTE=$([ "${RUN_FP32}" = 1 ] && echo "--fp32" || echo "") - -# Create synthetic instructions (summaries) for emails -python ../src/panza/data_preparation/summarize_emails.py \ - --path-to-emails="${PANZA_DATA_DIR}/${PANZA_USERNAME}_clean.jsonl" \ - --prompt-file="${PANZA_WORKSPACE}/src/panza/data_preparation/summarization_prompt.txt" \ - --batch-size=${PANZA_SUMMARIZATION_BATCH_SIZE} ${USE_4BIT_QUANT} ${USE_FP32_COMPUTE} && - -if [[ $TRAIN_RATIO < 1.0 ]]; then - # Create train and test splits - SPLIT_PANZA_DATA_DIR=${PANZA_DATA_DIR}/split - - python ../src/panza/data_preparation/split_data.py \ - --data-path="${PANZA_DATA_DIR}/${PANZA_USERNAME}_clean_summarized.jsonl" \ - --output-data-dir=${PANZA_DATA_DIR}/split \ - --train-ratio=${TRAIN_RATIO} \ - --split-type=${SPLIT_TYPE} \ - --seed=${PANZA_SEED} - - PANZA_DATA_DIR=$SPLIT_PANZA_DATA_DIR -else - cp "${PANZA_DATA_DIR}/${PANZA_USERNAME}_clean_summarized.jsonl" \ - "${PANZA_DATA_DIR}/train.jsonl" - - # Finetuning requires some sort of test set, just use the training - # data again. - cp "${PANZA_DATA_DIR}/${PANZA_USERNAME}_clean_summarized.jsonl" \ - "${PANZA_DATA_DIR}/test.jsonl" -fi - -# Create vector store with emails embeddings -# Note that if the data is split, then the PANZA_DATA_DIR, -# where the vector store will be, will be the /split directory. 
-python ../src/panza/data_preparation/create_vector_store.py \ - --path-to-emails="${PANZA_DATA_DIR}/train.jsonl" \ - --chunk-size=${CHUNK_SIZE} \ - --chunk-overlap=${CHUNK_OVERLAP} \ - --db-path=${PANZA_DATA_DIR} \ - --index-name=${PANZA_USERNAME} \ - --embedding_model=${PANZA_EMBEDDING_MODEL} diff --git a/scripts/prepare_train_eval.sh b/scripts/prepare_train_eval.sh new file mode 100755 index 0000000..c88782f --- /dev/null +++ b/scripts/prepare_train_eval.sh @@ -0,0 +1,63 @@ +# Convenience script for combining all data preparation, model training +# and model evaluation with json +# All arguments to the python script can be provided +# here exactly in the form they would be passed to the +# python script directly. +# +# Example usage: +# CUDA_VISIBLE_DEVICES=x ./prepare_train_eval.sh user=alonso finetuning=rosa + +set -e + +vars=() +# Set a default for the required user argument. We'll override it +# later if provided. +vars[1]=$"user=$(whoami)" +idx=2 + +# process input arguments +training_mode="tbd" # training_mode to be determined later. +test_split="0" +for argument in "$@" +do + key=$(echo $argument | cut -f1 -d=) + if [[ $key == user ]]; then + # We already set the default value here; change it now. + vars[1]=$argument + echo "Overriding user to be ${argument#*=}" + elif [[ $key == test_split ]]; then + test_split=${argument#*=} + echo "Setting the test_split to ${test_split}" + elif [[ $key == finetuning ]]; then + training_mode=${argument#*=} + echo "Setting finetuning mode to ${training_mode}" + elif [[ $training_mode == "rosa" ]] && [[ $key == finetuning.rosa.masks_only ]];then + echo "The 'finetuning.rosa.masks_only' argument is already set and should not be overridden here; override is ignored." + else + vars[idx]=$argument + idx+=1 + fi +done + +# Step 1. Prepare the data +python ./prepare_data.py ${vars[@]} +# Step 2 & 3 Combined. Determine the type of training to do and evaluate with json. +if [[ $training_mode == "rosa" ]]; then + # First create the masks for RoSA finetuning. + composer ../src/panza/finetuning/train.py \ + finetuning=rosa finetuning.rosa.masks_only=true ${vars[@]} + # Then train the weights. + composer ../src/panza/finetuning/train.py \ + finetuning=rosa finetuning.rosa.masks_only=false ${vars[@]} + if [[ $test_split != "0" ]]; then + echo "Generating json evaluation" + python runner.py interfaces=json writer/llm=peft + fi +elif [[ $training_mode == "full" ]]; then + composer ../src/panza/finetuning/train.py \ + finetuning=full ${vars[@]} + if [[ $test_split != "0" ]]; then + echo "Generating json evaluation" + python runner.py interfaces=json writer/llm=transformers + fi +fi \ No newline at end of file diff --git a/scripts/run_panza_cli.sh b/scripts/run_panza_cli.sh deleted file mode 100755 index 5a094db..0000000 --- a/scripts/run_panza_cli.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -source config.sh - -MODEL=${PANZA_GENERATIVE_MODEL} # Replace this with the checkpoint you want to use! 
- -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - -USE_RAG=$([ "${PANZA_DISABLE_RAG_INFERENCE}" = "1" ] && echo "" || echo "--use-rag") -USE_4BIT_QUANT=$([ "${MODEL_PRECISION}" = "4bit" ] && echo "--load-in-4bit" || echo "") - -INFERENCE_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/console_interactive_inference.py -python ${INFERENCE_SCRIPT} \ - --model=${MODEL} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --embedding-model=${PANZA_EMBEDDING_MODEL} \ - --db-path=${PANZA_DATA_DIR} \ - --index-name=${PANZA_USERNAME} \ - --rag-relevance-threshold=${PANZA_RAG_RELEVANCE_THRESHOLD} \ - ${USE_RAG} \ - ${USE_4BIT_QUANT} diff --git a/scripts/runner.py b/scripts/runner.py index e0728a3..445c50b 100644 --- a/scripts/runner.py +++ b/scripts/runner.py @@ -5,7 +5,7 @@ import os from omegaconf import DictConfig, OmegaConf -from panza3 import PanzaWriter # The import also loads custom Hydra resolvers +from panza import PanzaWriter # The import also loads custom Hydra resolvers LOGGER = logging.getLogger(__name__) diff --git a/scripts/train_fft.sh b/scripts/train_fft.sh index 7e06e85..92f8327 100755 --- a/scripts/train_fft.sh +++ b/scripts/train_fft.sh @@ -30,5 +30,5 @@ do fi done -composer ../src/panza3/finetuning/train.py \ +composer ../src/panza/finetuning/train.py \ finetuning=full ${vars[@]} \ No newline at end of file diff --git a/scripts/train_rosa.sh b/scripts/train_rosa.sh index 07d1e03..b8c5997 100755 --- a/scripts/train_rosa.sh +++ b/scripts/train_rosa.sh @@ -33,9 +33,9 @@ do done # First create the masks for RoSA finetuning. -composer ../src/panza3/finetuning/train.py \ +composer ../src/panza/finetuning/train.py \ finetuning=rosa finetuning.rosa.masks_only=true ${vars[@]} # Then train the weights. -composer ../src/panza3/finetuning/train.py \ +composer ../src/panza/finetuning/train.py \ finetuning=rosa finetuning.rosa.masks_only=false ${vars[@]} diff --git a/src/panza/__init__.py b/src/panza/__init__.py index e69de29..bdee682 100644 --- a/src/panza/__init__.py +++ b/src/panza/__init__.py @@ -0,0 +1,55 @@ +from omegaconf import OmegaConf + +from .prompting.utils import load_preamble, load_user_preamble + +OmegaConf.register_new_resolver("load_preamble", load_preamble) +OmegaConf.register_new_resolver("load_user_preamble", load_user_preamble) + +from .writer import PanzaWriter + +__all__ = ["PanzaWriter"] + +PANZA_ASCII_LOGO = """ . . . . . . + ... . . . . . . . + . . . . . =%[ :+. . + . . . .~@% +@( + . .. ~<: . . . >@^.@) . + . . :}: . . *@@@@@{^=-. + . ={@@~ . . . .)@@@@@@@@@#= . + *%@@@@^ . . ~^^ >@@@@@@@@@%(=. + (@%@@@@@@@@[ =#@@@@@@@@@@@#= -}@@@@@@]..*= + ^@@@@@@@@@@%.. :<#@@@@@@@@@@@{= ..:}@@@@@-) .. + ~@@@@@@@@@@@-:[@@@@@@@@@@@@@@%+ . =#@@@@@+* .. . + :@@@@@@#^:.^*>#@%#{@@@@@@@@@@@# ..+-.^@@@@@@<-- + .}@@@[+ -( . .<@@@@@@@@@@^ ^%[=)@@@@@@~<( + . .. +@*. *}@@@@@@@@@@@@()>+)@@+=%@@@@@+}{= . + {@@#%@@@@@@@@@@@@@@@%{@@@%+]@@@@@@+ + . =@@@@@@@@@@@@@@@@@@@@@{#{*.(@@@@@@* + . -@][[#@@@@@@@@@@@@@@@@@).=#@@@@@@* + +)}[{}* #..-@@@@@@@@@@@@@@@@@@%>(@@@@@@@^ + . ~[@@@@@@^:=#{#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@<. . + ~#@@@@@@#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@% + .:)@@@@(:~%@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@* + ~{@#<. <@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@#.. .. + == (@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@> . . + . .>@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@] . + . :%@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@(. + . 
^@@@@@@@@@@@@@@@@@@@@@@@@@@@@@~ + ... ~@@@@@@@@@@@@@@@@@@@@@@@@@[~ . . + . . . >@@@@@@@}=:=+^>^+@@@@@@@[: . . +. ..>@@@@@}- +@@@@@]: . .. +. :#<<( -{](> . + . . ..(]>}+~~~=======+}(((~-::... . .. + .:~=+^>)][[}{#%%@@@@@@@@@@@@@@@@@%%#{}}[])<>*+=-:. . + . . . . . .. . . . """ + +PANZA_ASCII_TEXT = """.______ ___ .__ __. ________ ___ +| _ \ / \ | \ | | | / / \ +| |_) | / ^ \ | \| | `---/ / / ^ \ +| ___/ / /_\ \ | . ` | / / / /_\ \ +| | / _____ \ | |\ | / /----./ _____ \ +| _| /__/ \__\ |__| \__| /________/__/ \__\ + """ + +print(PANZA_ASCII_LOGO) +print(PANZA_ASCII_TEXT) diff --git a/src/panza/data_preparation/create_vector_store.py b/src/panza/data_preparation/create_vector_store.py deleted file mode 100644 index 20b11c2..0000000 --- a/src/panza/data_preparation/create_vector_store.py +++ /dev/null @@ -1,61 +0,0 @@ -import argparse -import json -import time -from typing import List - - - -from panza.utils import rag -from panza.utils.documents import Email - - - -def load_emails(path: str) -> List[Email]: - with open(path, "r") as f: - lines = f.readlines() - - emails = [Email.deserialize(line) for line in lines] - - return emails - - - -def main(): - parser = argparse.ArgumentParser() - - parser = argparse.ArgumentParser(description="Store emails in a embeddings vector DB.") - parser.add_argument("--path-to-emails", help="Path to the cleaned emails") - parser.add_argument("--chunk-size", type=int, default=3000) - parser.add_argument("--chunk-overlap", type=int, default=3000) - parser.add_argument("--db-path", type=str) - parser.add_argument("--index-name", type=str) - parser.add_argument( - "--embedding_model", type=str, default="sentence-transformers/all-mpnet-base-v2" - ) - - args = parser.parse_args() - - # Load emails - emails = load_emails(args.path_to_emails) - print(f"Loaded {len(emails)} emails.") - - # Process emails - documents = process_emails(emails, args.chunk_size, args.chunk_overlap) - print(f"Obtained {len(documents)} text chuncks.") - - # Initialize embeddings model - embeddings_model = rag.get_embeddings_model(args.embedding_model) - - # Create vector DB - print("Creating vector DB...") - start = time.time() - db = rag.create_vector_db(documents, embeddings_model) - print(f"Vector DB created in {time.time() - start} seconds.") - - # Save vector DB to disk - db.save_local(folder_path=args.db_path, index_name=args.index_name) - print(f"Vector DB index {args.index_name} saved to {args.db_path}.") - - -if __name__ == "__main__": - main() diff --git a/src/panza/data_preparation/extract_emails.py b/src/panza/data_preparation/extract_emails.py index 12f1d79..e9ead5e 100644 --- a/src/panza/data_preparation/extract_emails.py +++ b/src/panza/data_preparation/extract_emails.py @@ -1,10 +1,11 @@ -import argparse import json import mailbox import re from email.utils import parsedate_to_datetime +from email.message import Message +from mailbox import mboxMessage from os import makedirs -from os.path import join +from os.path import join, dirname import langdetect @@ -19,6 +20,8 @@ SHORT_EMAIL_THRESHOLD = 10 # words +FORWARDED_MESSAGE_TAG = "---------- Forwarded message ---------" + def extract_only_plain_text(msg_part): if msg_part.get_content_type() == "text/plain": @@ -28,7 +31,7 @@ def extract_only_plain_text(msg_part): def skip_forwarded_messages(plain_text): - if "---------- Forwarded message ---------" in plain_text: + if FORWARDED_MESSAGE_TAG in plain_text: DISCARDED_EMAILS["forwarded"].append(plain_text) return "" else: @@ -42,7 +45,7 @@ def remove_date_time(email_body): match 
= pattern.search(email_body) if match: - return (email_body[:match.start()] + email_body[match.end():]).strip() + return (email_body[: match.start()] + email_body[match.end() :]).strip() else: return email_body @@ -61,17 +64,17 @@ def count_words(s): def extract_by_quote_level(text): # Split the text into lines - lines = text.split('\n') + lines = text.split("\n") # Dictionary to store lines by quote level grouped_lines = {} for line in lines: # Count the number of '>' at the start of the line - quote_level = len(re.match(r'^>*', line).group()) + quote_level = len(re.match(r"^>*", line).group()) # Remove leading '>' and spaces - clean_line = re.sub(r'^>*\s*', '', line) + clean_line = re.sub(r"^>*\s*", "", line) # Add the clean line to the appropriate group if quote_level not in grouped_lines: @@ -99,7 +102,7 @@ def filter_message(msg): email_with_thread = [remove_date_time(an_email) for an_email in email_with_thread] main_email = email_with_thread.pop(0) - email_with_thread.reverse() # chronological order + email_with_thread.reverse() # chronological order # check length before detecting language if count_words(main_email) < SHORT_EMAIL_THRESHOLD: @@ -121,20 +124,10 @@ def filter_message(msg): return (main_email.strip(), [an_email.strip() for an_email in email_with_thread]) -def main(): - parser = argparse.ArgumentParser(description="Process an MBOX file for PANZA project.") - parser.add_argument("--mbox-path", help="Path to the MBOX file.") - parser.add_argument("--output-path", help="Path to the directory to save the output files.") - parser.add_argument( - "--email", - action="append", - help="Email address(es) to filter the messages. Use the argument multiple times for multiple emails.", - ) - parser.add_argument("--save-discarded-emails", action="store_true") - args = parser.parse_args() +def extract_emails(mailbox_path, output_path, email_addresses, save_discarded_emails_path): - MBOX_PATH = args.mbox_path - EMAIL = args.email + MBOX_PATH = mailbox_path + EMAIL = email_addresses mbox = mailbox.mbox(MBOX_PATH) n_emails = len(mbox) @@ -142,20 +135,38 @@ def main(): print(f"--> processing {i}/{n_emails} <--") # Filter messages sent from your email address if message["from"] and any(email in message["from"] for email in EMAIL): - date = parsedate_to_datetime(message["Date"]).isoformat() + if message["Date"]: + date = parsedate_to_datetime(message["Date"]).isoformat() + else: + print("Date was not found in the email. 
Skipping.") + continue if message.is_multipart(): for part in message.walk(): filtered_msg = filter_message(part) if filtered_msg is not None: print(filtered_msg) main_email, thread = filtered_msg - CLEAN_EMAILS.append({"email": main_email, "thread": thread, "subject": message["Subject"], "date": date}) + CLEAN_EMAILS.append( + { + "email": main_email, + "thread": thread, + "subject": message["Subject"], + "date": date, + } + ) else: filtered_msg = filter_message(message) if filtered_msg is not None: print(filtered_msg) main_email, thread = filtered_msg - CLEAN_EMAILS.append({"email": main_email, "thread": thread, "subject": message["Subject"], "date": date}) + CLEAN_EMAILS.append( + { + "email": main_email, + "thread": thread, + "subject": message["Subject"], + "date": date, + } + ) print(f"\n---> [Cleaning stats] <---") print(f"# clean emails = {len(CLEAN_EMAILS)}") @@ -171,26 +182,27 @@ def main(): first_email = EMAIL[0] username = first_email[: first_email.find("@")] - makedirs(args.output_path, exist_ok=True) + makedirs(dirname(output_path), exist_ok=True) # Save clean emails - with open(join(args.output_path, username + "_clean.jsonl"), "w", encoding="utf-8") as f: + with open(join(output_path), "w", encoding="utf-8") as f: for item in CLEAN_EMAILS: json_record = json.dumps(item) f.write(json_record + "\n") # Save discarded emails - if args.save_discarded_emails: - makedirs(join(args.output_path, "discarded"), exist_ok=True) + if save_discarded_emails_path and save_discarded_emails_path != "": + print(f"\n---> Processing Discarded Emails <---") + makedirs(save_discarded_emails_path, exist_ok=True) for k, v in DISCARDED_EMAILS.items(): - output_path = join( - args.output_path, "discarded", username + "_discarded_" + k + ".jsonl" - ) + print(f"--> processing {k} emails <--") + output_path = join(save_discarded_emails_path, f"{username}_discarded_{k}.jsonl") with open(output_path, "w", encoding="utf-8") as f: - for item in v: + discarded_emails = len(v) + for i, item in enumerate(v): + print("\n\n\n\n\===========================") + if type(item) is Message or type(item) is mboxMessage: + item = item.get_payload() + print(f"--> processing {i}/{discarded_emails} <--") json_record = json.dumps(item) f.write(json_record + "\n") - - -if __name__ == "__main__": - main() diff --git a/src/panza3/data_preparation/rag.py b/src/panza/data_preparation/rag.py similarity index 100% rename from src/panza3/data_preparation/rag.py rename to src/panza/data_preparation/rag.py diff --git a/src/panza3/entities/__init__.py b/src/panza/entities/__init__.py similarity index 100% rename from src/panza3/entities/__init__.py rename to src/panza/entities/__init__.py diff --git a/src/panza3/entities/document.py b/src/panza/entities/document.py similarity index 100% rename from src/panza3/entities/document.py rename to src/panza/entities/document.py diff --git a/src/panza3/entities/instruction.py b/src/panza/entities/instruction.py similarity index 100% rename from src/panza3/entities/instruction.py rename to src/panza/entities/instruction.py diff --git a/src/panza/evaluation/.evaluate_summaries.py.swp b/src/panza/evaluation/.evaluate_summaries.py.swp deleted file mode 100644 index 6787e235bdb151d90c2b5baaf71c4a24745a56ae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHOO^h5z6>fq_$R9)@_<#f|_eM;Qm}&2iiNnG~TE`n>ylXFx?T8GkPS14BY_mPx zov!L#j~5e(V-PBaa5WbM->a(bo|*kgKpYVC zNT0g9Ue&8tuU@^b_o}Bif9?D_`-D4h;rEbb{p^v?3|3xgAOH3vr!5{PgD_5Zh;-C1 
diff --git a/src/panza/evaluation/base_inference.py b/src/panza/evaluation/base_inference.py deleted file mode 100644 index 8f421e9..0000000 --- a/src/panza/evaluation/base_inference.py +++ /dev/null @@ -1,117 +0,0 @@ -import argparse -import os -import sys - -import torch - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.utils import prompting -from panza.utils.documents import Email -from transformers import AutoTokenizer, AutoModelForCausalLM -from peft import AutoPeftModelForCausalLM - -sys.path.pop(0) - - -def get_base_inference_args_parser(): - parser = argparse.ArgumentParser() - - parser.add_argument("--model", default=None) - parser.add_argument("--system-preamble", type=str, default=None) - parser.add_argument("--user-preamble", type=str, default=None) - parser.add_argument("--rag-preamble", type=str, default=None) - parser.add_argument("--thread-preamble", type=str, default=None) - parser.add_argument("--best", action="store_true", default=False) - parser.add_argument("--temperature", type=float, default=0.7) - parser.add_argument("--top-k", type=int, default=50) - parser.add_argument("--top-p", type=float, default=0.7) - parser.add_argument("--max-new-tokens", type=int, default=1024) - parser.add_argument("--use-rag", action="store_true", default=False) - parser.add_argument("--rag-relevance-threshold", type=float, default=0.2) - parser.add_argument( - "--embedding-model", type=str, default="sentence-transformers/all-mpnet-base-v2" - ) - parser.add_argument("--db-path", type=str, default=None) - parser.add_argument("--index-name", type=str, default=None) - parser.add_argument("--rag-num-emails", type=int, default=7) - parser.add_argument("--device", type=str, default="cuda:0") - parser.add_argument("--dtype", type=str, default="bf16") - parser.add_argument("--nthreads", type=int, default=None) - parser.add_argument("--load-in-4bit", default=False, action="store_true") - - return parser - - -def load_model_and_tokenizer(model_path, device, dtype, load_in_4bit): - - if os.path.exists(os.path.join(model_path, "adapter_config.json")): - print("found an adapter.") - if load_in_4bit: - model = AutoPeftModelForCausalLM.from_pretrained( - model_path, device_map=device, quantization_config=quant_config, trust_remote_code=True - ) - else: - model = AutoPeftModelForCausalLM.from_pretrained( - model_path, torch_dtype=dtype, device_map=device, trust_remote_code=True - ) - model = model.merge_and_unload() - else: - if load_in_4bit: - model = AutoModelForCausalLM.from_pretrained( - model_path, device_map=device, quantization_config=quant_config, trust_remote_code=True - ) - else: - model = AutoModelForCausalLM.from_pretrained( - model_path, torch_dtype=dtype, device_map=device, trust_remote_code=True - ) - - tokenizer = AutoTokenizer.from_pretrained( - model_path, model_max_length=model.config.max_position_embeddings - ) - tokenizer.padding_side = "left" - tokenizer.pad_token = tokenizer.eos_token - - return model, tokenizer - - -def run_inference( - instructions, - model, - tokenizer, - system_preamble, - user_preamble, - rag_preamble, - rag_relevance_threshold, - rag_num_emails, - thread_preamble, - use_rag, - db, - max_new_tokens, - best, - temperature, - top_k, - top_p, - device, -): - batch = [] - prompts = [] - for instruction, thread in instructions: - relevant_emails = [] - if use_rag: - assert db is not None, "RAG requires a database to be provided." 
- re = db._similarity_search_with_relevance_scores( - instruction, k=rag_num_emails - ) - relevant_emails = [ - Email.deserialize(r[0].metadata["serialized_email"]) - for r in re - if r[1] >= rag_relevance_threshold - ] - - prompt = prompting.create_prompt( - instruction, system_preamble, user_preamble, rag_preamble, relevant_emails, thread_preamble, thread, - ) - prompts.append(prompt) - messages = [{"role": "user", "content": prompt}] - batch.append(messages) diff --git a/src/panza/evaluation/console_interactive_inference.py b/src/panza/evaluation/console_interactive_inference.py deleted file mode 100644 index 92bbd38..0000000 --- a/src/panza/evaluation/console_interactive_inference.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import sys - -import torch - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.evaluation import base_inference -from panza.utils import prompting, rag - -sys.path.pop(0) - - -def main(): - parser = base_inference.get_base_inference_args_parser() - args = parser.parse_args() - - print("Running inference with args:", args) - - if args.nthreads is not None: - torch.set_num_threads(args.nthreads) - - print("Loading model ", args.model) - model, tokenizer = base_inference.load_model_and_tokenizer(args.model, args.device, args.dtype, load_in_4bit=args.load_in_4bit) - - if args.use_rag: - embeddings_model = rag.get_embeddings_model(args.embedding_model) - db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) - - system_preamble, user_preamble, rag_preamble, _ = prompting.load_all_preambles( - args.system_preamble, args.user_preamble, args.rag_preamble, args.thread_preamble - ) - - while True: - user_input = input("Enter another request (or 'quit' to exit): ") - - if user_input.lower() == "quit": - print("Exiting...") - break - - prompts, outputs = base_inference.run_inference( - instructions=[(user_input, None)], - model=model, - tokenizer=tokenizer, - system_preamble=system_preamble, - user_preamble=user_preamble, - rag_preamble=rag_preamble, - rag_relevance_threshold=args.rag_relevance_threshold, - rag_num_emails=args.rag_num_emails, - thread_preamble=None, - use_rag=args.use_rag, - db=db if args.use_rag else None, - max_new_tokens=args.max_new_tokens, - best=args.best, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - device=args.device, - ) - - print("Processed input:", prompts[0]) - print("Generated email", outputs[0]) - - -if __name__ == "__main__": - main() diff --git a/src/panza/evaluation/evaluate b/src/panza/evaluation/evaluate deleted file mode 100644 index a56617a..0000000 --- a/src/panza/evaluation/evaluate +++ /dev/null @@ -1,180 +0,0 @@ -# We conduct evaluations with three scores. -# The BLEU score is frequently used to evaluate translations and compares n-grams in a 'golden' -# translation to those in a candidate translation. Multiple golden translations are possible. -# The ROUGE score is frequently used for translation and summarization; it also looks at -# n-gram similarity. It is actually several scores, since precision, recall, and F1 score are -# reported separately. -# The MAUVE score measures distribution similarity (in the sense of KL-divergence) between the -# targets and outputs, and is not computed on a per-example basis. The similarity is computed -# in the latent space of an LLM, by default GPT-2. 
- - -import json -import os -import re -import string -import sys - -from evaluate import load -from torchmetrics.text.rouge import ROUGEScore -from torchmetrics.text.bleu import BLEUScore - -import numpy as np -import torch -import wandb - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.evaluation import base_inference -from panza.utils import prompting, rag - -sys.path.pop(0) - - -def main(): - parser = base_inference.get_base_inference_args_parser() - parser.add_argument("--responses-per-prompt", type=int, default=1) - parser.add_argument("--golden", type=str, default=None) - parser.add_argument("--batch-size", type=int, default=1) - parser.add_argument("--wandb-run-id", type=str, default=None) - args = parser.parse_args() - - rouge = ROUGEScore() - # This library computes the BLEU score components separately. We do not use a length penalty. - bleu1 = BLEUScore(n_gram=1) - bleu2 = BLEUScore(n_gram=2) - bleu3 = BLEUScore(n_gram=3) - bleu4 = BLEUScore(n_gram=4) - mauve = load('mauve') - - if args.nthreads is not None: - torch.set_num_threads(args.nthreads) - - print("Loading model ", args.model) - model, tokenizer = base_inference.load_model_and_tokenizer(args.model, args.device, args.dtype, load_in_4bit=args.load_in_4bit) - - if args.use_rag: - embeddings_model = rag.get_embeddings_model(args.embedding_model) - db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) - - system_preamble, user_preamble, rag_preamble = prompting.load_all_preambles( - args.system_preamble, args.user_preamble, args.rag_preamble - ) - - with open(args.golden, "r") as f: - golden_lines = [json.loads(l) for l in f.readlines()] - - grouped_golden = {} - for entry in golden_lines: - if entry["summary"] in grouped_golden: - grouped_golden[entry["summary"]]["templates"].append(entry["email"]) - else: - grouped_golden[entry["summary"]] = {} - grouped_golden[entry["summary"]]["templates"] = [(entry["email"])] - - print("Evaluating with batch size", args.batch_size) - - results = {} - all_results = [] - prompt_scores = {} - outputs_logs = {} - grouped_golden = list(grouped_golden.items()) - for i in range(0, len(grouped_golden), args.batch_size): - batch = grouped_golden[i:i + args.batch_size] - prompts = [item[0] for item in batch] - golden_responses = [item[1]["templates"] for item in batch] - - #prompt_scores = [[] for _ in range(len(prompts))] - for _ in range(args.responses_per_prompt): - full_prompts, outputs = base_inference.run_inference( - instructions=prompts, - model=model, - tokenizer=tokenizer, - system_preamble=system_preamble, - user_preamble=user_preamble, - rag_preamble=rag_preamble, - rag_relevance_threshold=args.rag_relevance_threshold, - rag_num_emails=args.rag_num_emails, - use_rag=args.use_rag, - db=db if args.use_rag else None, - max_new_tokens=args.max_new_tokens, - best=args.best, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - device=args.device, - ) - - # Remove some boilerplate added by instruction-tuned models w/out finetuning. - outputs = [o.replace("Here is the email:\n", "") for o in outputs] - outputs = [re.sub(r'SUBJECT:.*\n', "", o) for o in outputs] - outputs = [re.sub(r'Subject:.*\n', "", o) for o in outputs] - outputs = [re.sub(r'E-MAIL CONTENT:.*\n', "", o) for o in outputs] - for j, prompt in enumerate(prompts): - # We clean up the strings for the BLEU and ROUGE scores. 
- punc_table = str.maketrans({key: None for key in string.punctuation}) - golden = [" ".join(x.translate(punc_table).lower().split()) for x in golden_responses[j]] - candidate = " ".join(outputs[j].translate(punc_table).lower().split()) - - rouge_score = rouge(outputs[j], golden_responses[j]) - bleu_score = np.mean([bleu([candidate], [golden]) for bleu in [bleu1, bleu2, bleu3, bleu4]]) - rouge_score = rouge(candidate, golden) - if prompt not in prompt_scores.keys(): - prompt_scores[prompt] = {"prompt": prompt, "full_prompt": full_prompts[j], - "golden" : golden_responses[j], "output": [outputs[j]], - "BLEU": [bleu_score.item()]} - for score, value in rouge_score.items(): - prompt_scores[prompt][score] = [value.item()] - else: - prompt_scores[prompt]["output"].append(outputs[j]) - prompt_scores[prompt]["BLEU"].append(bleu_score.item()) - for score, value in rouge_score.items(): - prompt_scores[prompt][score].append(value.item()) - - print("\n-----------\n", "PROMPT:\n", prompt, "\n\nOUTPUT:\n", outputs[j], "\n\nBLEU SCORE:\n", bleu_score, "\n\nROUGE SCORE:\n", rouge_score) - - - means = {} - mins = {} - score_names = [k for k in prompt_scores.values().__iter__().__next__().keys() if 'BLEU' in k or 'rouge' in k] - - for k in score_names: - means[k] = np.mean([v for scores in prompt_scores.values() for v in scores[k] ]) - mins[k] = np.min([v for scores in prompt_scores.values() for v in scores[k] ]) - - # To compute the MAUVE score, we need equal-length flat arrays of - # outputs and goldens. If we have multiple outputs per prompt, we - # output them all, with the same golden prompt. - # TODO: not sure if it would be better to randomly sample from the - # outputs in this case. - # TODO: consider handling the case where there are also multiple golden - # queries per output. (We don't use this for anything now). - flattened_golden = [] - flattened_outputs = [] - for prompt_info in prompt_scores.values(): - flattened_golden += ([prompt_info["golden"][0]])*len(prompt_info['output']) - flattened_outputs += prompt_info['output'] - mauve_score = mauve.compute(predictions=flattened_outputs, references=flattened_golden) - print("MAUVE score", mauve_score) - means["MAUVE"] = mauve_score.mauve - print("Mean scores across all prompts: ", {f" {k}: {v}" for k, v in means.items()}) - - - # Optionally, update wandb run with eval scores - rag_str = "RAG-" if args.use_rag else "" - if args.wandb_run_id: - with wandb.init(id=args.wandb_run_id, resume=True): - wandb.log({f"EVAL/{k}-{rag_str}mean": v for k, v in means.items()}) - wandb.log({f"EVAL/{k}-{rag_str}min": v for k, v in mins.items()}) - else: - print({f"EVAL/{k}-{rag_str}mean": v for k, v in means.items()}) - print({f"EVAL/{k}-{rag_str}min": v for k, v in mins.items()}) - - with open(os.path.join(args.model, f"{rag_str}eval_responses.txt"), 'w') as f: - json.dump(prompt_scores, f, ensure_ascii=False, indent=4) - - with open(os.path.join(args.model, f"{rag_str}eval_summary.txt"), 'w') as f: - json.dump({"means": means, "mins": mins}, f, ensure_ascii=False, indent=4) - -if __name__ == "__main__": - main() diff --git a/src/panza/evaluation/evaluate_backup.py b/src/panza/evaluation/evaluate_backup.py deleted file mode 100644 index a56617a..0000000 --- a/src/panza/evaluation/evaluate_backup.py +++ /dev/null @@ -1,180 +0,0 @@ -# We conduct evaluations with three scores. -# The BLEU score is frequently used to evaluate translations and compares n-grams in a 'golden' -# translation to those in a candidate translation. Multiple golden translations are possible. 
-# The ROUGE score is frequently used for translation and summarization; it also looks at -# n-gram similarity. It is actually several scores, since precision, recall, and F1 score are -# reported separately. -# The MAUVE score measures distribution similarity (in the sense of KL-divergence) between the -# targets and outputs, and is not computed on a per-example basis. The similarity is computed -# in the latent space of an LLM, by default GPT-2. - - -import json -import os -import re -import string -import sys - -from evaluate import load -from torchmetrics.text.rouge import ROUGEScore -from torchmetrics.text.bleu import BLEUScore - -import numpy as np -import torch -import wandb - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.evaluation import base_inference -from panza.utils import prompting, rag - -sys.path.pop(0) - - -def main(): - parser = base_inference.get_base_inference_args_parser() - parser.add_argument("--responses-per-prompt", type=int, default=1) - parser.add_argument("--golden", type=str, default=None) - parser.add_argument("--batch-size", type=int, default=1) - parser.add_argument("--wandb-run-id", type=str, default=None) - args = parser.parse_args() - - rouge = ROUGEScore() - # This library computes the BLEU score components separately. We do not use a length penalty. - bleu1 = BLEUScore(n_gram=1) - bleu2 = BLEUScore(n_gram=2) - bleu3 = BLEUScore(n_gram=3) - bleu4 = BLEUScore(n_gram=4) - mauve = load('mauve') - - if args.nthreads is not None: - torch.set_num_threads(args.nthreads) - - print("Loading model ", args.model) - model, tokenizer = base_inference.load_model_and_tokenizer(args.model, args.device, args.dtype, load_in_4bit=args.load_in_4bit) - - if args.use_rag: - embeddings_model = rag.get_embeddings_model(args.embedding_model) - db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) - - system_preamble, user_preamble, rag_preamble = prompting.load_all_preambles( - args.system_preamble, args.user_preamble, args.rag_preamble - ) - - with open(args.golden, "r") as f: - golden_lines = [json.loads(l) for l in f.readlines()] - - grouped_golden = {} - for entry in golden_lines: - if entry["summary"] in grouped_golden: - grouped_golden[entry["summary"]]["templates"].append(entry["email"]) - else: - grouped_golden[entry["summary"]] = {} - grouped_golden[entry["summary"]]["templates"] = [(entry["email"])] - - print("Evaluating with batch size", args.batch_size) - - results = {} - all_results = [] - prompt_scores = {} - outputs_logs = {} - grouped_golden = list(grouped_golden.items()) - for i in range(0, len(grouped_golden), args.batch_size): - batch = grouped_golden[i:i + args.batch_size] - prompts = [item[0] for item in batch] - golden_responses = [item[1]["templates"] for item in batch] - - #prompt_scores = [[] for _ in range(len(prompts))] - for _ in range(args.responses_per_prompt): - full_prompts, outputs = base_inference.run_inference( - instructions=prompts, - model=model, - tokenizer=tokenizer, - system_preamble=system_preamble, - user_preamble=user_preamble, - rag_preamble=rag_preamble, - rag_relevance_threshold=args.rag_relevance_threshold, - rag_num_emails=args.rag_num_emails, - use_rag=args.use_rag, - db=db if args.use_rag else None, - max_new_tokens=args.max_new_tokens, - best=args.best, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - device=args.device, - ) - - # Remove some boilerplate added by instruction-tuned models w/out finetuning. 
- outputs = [o.replace("Here is the email:\n", "") for o in outputs] - outputs = [re.sub(r'SUBJECT:.*\n', "", o) for o in outputs] - outputs = [re.sub(r'Subject:.*\n', "", o) for o in outputs] - outputs = [re.sub(r'E-MAIL CONTENT:.*\n', "", o) for o in outputs] - for j, prompt in enumerate(prompts): - # We clean up the strings for the BLEU and ROUGE scores. - punc_table = str.maketrans({key: None for key in string.punctuation}) - golden = [" ".join(x.translate(punc_table).lower().split()) for x in golden_responses[j]] - candidate = " ".join(outputs[j].translate(punc_table).lower().split()) - - rouge_score = rouge(outputs[j], golden_responses[j]) - bleu_score = np.mean([bleu([candidate], [golden]) for bleu in [bleu1, bleu2, bleu3, bleu4]]) - rouge_score = rouge(candidate, golden) - if prompt not in prompt_scores.keys(): - prompt_scores[prompt] = {"prompt": prompt, "full_prompt": full_prompts[j], - "golden" : golden_responses[j], "output": [outputs[j]], - "BLEU": [bleu_score.item()]} - for score, value in rouge_score.items(): - prompt_scores[prompt][score] = [value.item()] - else: - prompt_scores[prompt]["output"].append(outputs[j]) - prompt_scores[prompt]["BLEU"].append(bleu_score.item()) - for score, value in rouge_score.items(): - prompt_scores[prompt][score].append(value.item()) - - print("\n-----------\n", "PROMPT:\n", prompt, "\n\nOUTPUT:\n", outputs[j], "\n\nBLEU SCORE:\n", bleu_score, "\n\nROUGE SCORE:\n", rouge_score) - - - means = {} - mins = {} - score_names = [k for k in prompt_scores.values().__iter__().__next__().keys() if 'BLEU' in k or 'rouge' in k] - - for k in score_names: - means[k] = np.mean([v for scores in prompt_scores.values() for v in scores[k] ]) - mins[k] = np.min([v for scores in prompt_scores.values() for v in scores[k] ]) - - # To compute the MAUVE score, we need equal-length flat arrays of - # outputs and goldens. If we have multiple outputs per prompt, we - # output them all, with the same golden prompt. - # TODO: not sure if it would be better to randomly sample from the - # outputs in this case. - # TODO: consider handling the case where there are also multiple golden - # queries per output. (We don't use this for anything now). 
- flattened_golden = [] - flattened_outputs = [] - for prompt_info in prompt_scores.values(): - flattened_golden += ([prompt_info["golden"][0]])*len(prompt_info['output']) - flattened_outputs += prompt_info['output'] - mauve_score = mauve.compute(predictions=flattened_outputs, references=flattened_golden) - print("MAUVE score", mauve_score) - means["MAUVE"] = mauve_score.mauve - print("Mean scores across all prompts: ", {f" {k}: {v}" for k, v in means.items()}) - - - # Optionally, update wandb run with eval scores - rag_str = "RAG-" if args.use_rag else "" - if args.wandb_run_id: - with wandb.init(id=args.wandb_run_id, resume=True): - wandb.log({f"EVAL/{k}-{rag_str}mean": v for k, v in means.items()}) - wandb.log({f"EVAL/{k}-{rag_str}min": v for k, v in mins.items()}) - else: - print({f"EVAL/{k}-{rag_str}mean": v for k, v in means.items()}) - print({f"EVAL/{k}-{rag_str}min": v for k, v in mins.items()}) - - with open(os.path.join(args.model, f"{rag_str}eval_responses.txt"), 'w') as f: - json.dump(prompt_scores, f, ensure_ascii=False, indent=4) - - with open(os.path.join(args.model, f"{rag_str}eval_summary.txt"), 'w') as f: - json.dump({"means": means, "mins": mins}, f, ensure_ascii=False, indent=4) - -if __name__ == "__main__": - main() diff --git a/src/panza/evaluation/evaluation.py b/src/panza/evaluation/evaluation.py deleted file mode 100644 index 5882f65..0000000 --- a/src/panza/evaluation/evaluation.py +++ /dev/null @@ -1,194 +0,0 @@ -# We conduct evaluations with three scores. -# The BLEU score is frequently used to evaluate translations and compares n-grams in a 'golden' -# translation to those in a candidate translation. Multiple golden translations are possible. -# The ROUGE score is frequently used for translation and summarization; it also looks at -# n-gram similarity. It is actually several scores, since precision, recall, and F1 score are -# reported separately. -# The MAUVE score measures distribution similarity (in the sense of KL-divergence) between the -# targets and outputs, and is not computed on a per-example basis. The similarity is computed -# in the latent space of an LLM, by default GPT-2. - -import json -import os -import re -import string -import sys - -import numpy as np -import torch -import wandb -from evaluate import load -from torchmetrics.text.bleu import BLEUScore -from torchmetrics.text.rouge import ROUGEScore - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.evaluation import base_inference -from panza.utils import prompting, rag - -sys.path.pop(0) - - -def main(): - parser = base_inference.get_base_inference_args_parser() - parser.add_argument("--responses-per-prompt", type=int, default=1) - parser.add_argument("--golden", type=str, default=None) - parser.add_argument("--batch-size", type=int, default=1) - parser.add_argument("--use-thread", action="store_true", default=False) - parser.add_argument("--wandb-run-id", type=str, default=None) - args = parser.parse_args() - - rouge = ROUGEScore() - # This library computes the BLEU score components separately. We do not use a length penalty. 
- bleu1 = BLEUScore(n_gram=1) - bleu2 = BLEUScore(n_gram=2) - bleu3 = BLEUScore(n_gram=3) - bleu4 = BLEUScore(n_gram=4) - mauve = load('mauve') - - if args.nthreads is not None: - torch.set_num_threads(args.nthreads) - - print("Loading model ", args.model) - model, tokenizer = base_inference.load_model_and_tokenizer(args.model, args.device, args.dtype, load_in_4bit=args.load_in_4bit) - - if args.use_rag: - embeddings_model = rag.get_embeddings_model(args.embedding_model) - db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) - - system_preamble, user_preamble, rag_preamble, thread_preamble = prompting.load_all_preambles( - args.system_preamble, args.user_preamble, args.rag_preamble, args.thread_preamble - ) - - with open(args.golden, "r") as f: - golden_lines = [json.loads(l) for l in f.readlines()] - - grouped_golden = {} - for entry in golden_lines: - if entry["summary"] in grouped_golden: - grouped_golden[entry["summary"]]["templates"].append(entry["email"]) - else: - grouped_golden[entry["summary"]] = {} - grouped_golden[entry["summary"]]["templates"] = [(entry["email"])] - grouped_golden[entry["summary"]]["thread"] = entry["thread"] - - print("Evaluating with batch size", args.batch_size) - - results = {} - all_results = [] - prompt_scores = {} - outputs_logs = {} - grouped_golden = list(grouped_golden.items()) - for i in range(0, len(grouped_golden), args.batch_size): - batch = grouped_golden[i:i + args.batch_size] - prompts = [item[0] for item in batch] - if args.use_thread: - threads = [item[1]["thread"] for item in batch] - golden_responses = [item[1]["templates"] for item in batch] - - #prompt_scores = [[] for _ in range(len(prompts))] - for _ in range(args.responses_per_prompt): - if args.use_thread: - instructions = list(zip(prompts, threads)) - else: - instructions = list(zip(prompts, [None]*len(prompts))) - - full_prompts, outputs = base_inference.run_inference( - instructions=instructions, - model=model, - tokenizer=tokenizer, - system_preamble=system_preamble, - user_preamble=user_preamble, - rag_preamble=rag_preamble, - rag_relevance_threshold=args.rag_relevance_threshold, - rag_num_emails=args.rag_num_emails, - thread_preamble=thread_preamble, - use_rag=args.use_rag, - db=db if args.use_rag else None, - max_new_tokens=args.max_new_tokens, - best=args.best, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - device=args.device, - ) - - # Remove some boilerplate added by instruction-tuned models w/out finetuning. - outputs = [o.replace("Here is the email:\n", "") for o in outputs] - outputs = [re.sub(r'SUBJECT:.*\n', "", o) for o in outputs] - outputs = [re.sub(r'Subject:.*\n', "", o) for o in outputs] - outputs = [re.sub(r'E-MAIL CONTENT:.*\n', "", o) for o in outputs] - for j, prompt in enumerate(prompts): - # We clean up the strings for the BLEU and ROUGE scores. 
- punc_table = str.maketrans({key: None for key in string.punctuation}) - golden = [" ".join(x.translate(punc_table).lower().split()) for x in golden_responses[j]] - candidate = " ".join(outputs[j].translate(punc_table).lower().split()) - - rouge_score = rouge(outputs[j], golden_responses[j]) - bleu_score = np.mean([bleu([candidate], [golden]) for bleu in [bleu1, bleu2, bleu3, bleu4]]) - rouge_score = rouge(candidate, golden) - if prompt not in prompt_scores.keys(): - prompt_scores[prompt] = {"prompt": prompt, "full_prompt": full_prompts[j], - "golden" : golden_responses[j], "output": [outputs[j]], - "BLEU": [bleu_score.item()]} - for score, value in rouge_score.items(): - prompt_scores[prompt][score] = [value.item()] - else: - prompt_scores[prompt]["output"].append(outputs[j]) - prompt_scores[prompt]["BLEU"].append(bleu_score.item()) - for score, value in rouge_score.items(): - prompt_scores[prompt][score].append(value.item()) - - print("\n-----------\n", "PROMPT:\n", prompt, "\n\nOUTPUT:\n", outputs[j], "\n\nBLEU SCORE:\n", bleu_score, "\n\nROUGE SCORE:\n", rouge_score) - - - means = {} - mins = {} - score_names = [k for k in prompt_scores.values().__iter__().__next__().keys() if 'BLEU' in k or 'rouge' in k] - - for k in score_names: - means[k] = np.mean([v for scores in prompt_scores.values() for v in scores[k] ]) - mins[k] = np.min([v for scores in prompt_scores.values() for v in scores[k] ]) - - # To compute the MAUVE score, we need equal-length flat arrays of - # outputs and goldens. If we have multiple outputs per prompt, we - # output them all, with the same golden prompt. - # TODO: not sure if it would be better to randomly sample from the - # outputs in this case. - # TODO: consider handling the case where there are also multiple golden - # queries per output. (We don't use this for anything now). 
- flattened_golden = [] - flattened_outputs = [] - for prompt_info in prompt_scores.values(): - flattened_golden += ([prompt_info["golden"][0]])*len(prompt_info['output']) - flattened_outputs += prompt_info['output'] - mauve_score = mauve.compute(predictions=flattened_outputs, references=flattened_golden) - print("MAUVE score", mauve_score) - means["MAUVE"] = mauve_score.mauve - print("Mean scores across all prompts: ", {f" {k}: {v}" for k, v in means.items()}) - - - # Optionally, update wandb run with eval scores - if args.use_thread: - setting_str = "THREAD-" - elif args.use_rag: - setting_str = "RAG-" - else: - setting_str = "" - - if args.wandb_run_id: - with wandb.init(id=args.wandb_run_id, resume=True): - wandb.log({f"EVAL/{k}-{setting_str}mean": v for k, v in means.items()}) - wandb.log({f"EVAL/{k}-{setting_str}min": v for k, v in mins.items()}) - else: - print({f"EVAL/{k}-{setting_str}mean": v for k, v in means.items()}) - print({f"EVAL/{k}-{setting_str}min": v for k, v in mins.items()}) - - with open(os.path.join(args.model, f"{setting_str}eval_responses.txt"), 'w') as f: - json.dump(prompt_scores, f, ensure_ascii=False, indent=4) - - with open(os.path.join(args.model, f"{setting_str}eval_summary.txt"), 'w') as f: - json.dump({"means": means, "mins": mins}, f, ensure_ascii=False, indent=4) - -if __name__ == "__main__": - main() diff --git a/src/panza/evaluation/gui_inference.py b/src/panza/evaluation/gui_inference.py deleted file mode 100644 index 24d0b90..0000000 --- a/src/panza/evaluation/gui_inference.py +++ /dev/null @@ -1,89 +0,0 @@ -import argparse -import os -import sys - -import gradio as gr -import torch - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.evaluation import base_inference -from panza.utils import prompting, rag - -sys.path.pop(0) - - -def get_execute(model, tokenizer, system_preamble, user_preamble, rag_preamble, db, args): - - def execute(prompt): - prompts, outputs = base_inference.run_inference( - instructions=[(prompt, None)], - model=model, - tokenizer=tokenizer, - system_preamble=system_preamble, - user_preamble=user_preamble, - rag_preamble=rag_preamble, - rag_relevance_threshold=args.rag_relevance_threshold, - rag_num_emails=args.rag_num_emails, - thread_preamble=None, - use_rag=args.use_rag, - db=db if args.use_rag else None, - max_new_tokens=args.max_new_tokens, - best=args.best, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - device=args.device, - ) - print("Prompt\n", prompts[0]) - print("Output\n", outputs[0]) - yield outputs[0] - - return execute - - -def main(): - parser = base_inference.get_base_inference_args_parser() - parser.add_argument("--host", type=str, default=None) - parser.add_argument("--port", type=int, default=8001) - args = parser.parse_args() - - print("Running inference with args:", args) - - if args.nthreads is not None: - torch.set_num_threads(args.nthreads) - - print("Loading model ", args.model) - model, tokenizer = base_inference.load_model_and_tokenizer(args.model, args.device, args.dtype, load_in_4bit=args.load_in_4bit) - - if args.use_rag: - embeddings_model = rag.get_embeddings_model(args.embedding_model) - db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) - - system_preamble, user_preamble, rag_preamble, _ = prompting.load_all_preambles( - args.system_preamble, args.user_preamble, args.rag_preamble, args.thread_preamble - ) - - with gr.Blocks() as panza: - gr.Markdown("# Panza\n") - inputbox = 
gr.Textbox(label="Input", placeholder="Enter text and press ENTER") - outputbox = gr.Textbox(label="Output", placeholder="Generated result from the model") - inputbox.submit( - get_execute( - model=model, - tokenizer=tokenizer, - system_preamble=system_preamble, - user_preamble=user_preamble, - rag_preamble=rag_preamble, - db=db if args.use_rag else None, - args=args, - ), - [inputbox], - [outputbox], - ) - - panza.queue().launch(server_name=args.host, server_port=args.port, share=True) - - -if __name__ == "__main__": - main() diff --git a/src/panza/evaluation/ollama_inference.py b/src/panza/evaluation/ollama_inference.py deleted file mode 100644 index a0d7a84..0000000 --- a/src/panza/evaluation/ollama_inference.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -import sys - -import ollama -import torch - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.evaluation import base_inference -from panza.utils import prompting, rag -from panza.utils.documents import Email - -sys.path.pop(0) - - -def get_response_stream(prompt: str, model: str): - stream = ollama.chat( - model=model, - messages=[{"role": "user", "content": prompt}], - stream=True, - ) - - return stream - - -def print_response_stream(stream): - for chunk in stream: - print(chunk["message"]["content"], end="", flush=True) - - - -def main(): - parser = base_inference.get_base_inference_args_parser() - args = parser.parse_args() - - print("Running inference with args:", args) - - if args.nthreads is not None: - torch.set_num_threads(args.nthreads) - - - if args.use_rag: - embeddings_model = rag.get_embeddings_model(args.embedding_model) - db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) - - system_preamble, user_preamble, rag_preamble, _ = prompting.load_all_preambles( - args.system_preamble, args.user_preamble, args.rag_preamble, args.thread_preamble - ) - - while True: - instruction = input("Enter another request (or 'quit' to exit): ") - - if instruction.lower() == "quit": - print("Exiting...") - break - - relevant_emails = [] - if args.use_rag: - assert db is not None, "RAG requires a database to be provided." 
- re = db._similarity_search_with_relevance_scores(instruction, k=args.rag_num_emails) - relevant_emails = [ - Email.deserialize(r[0].metadata["serialized_email"]) - for r in re - if r[1] >= args.rag_relevance_threshold - ] - - prompt = prompting.create_prompt( - instruction, - system_preamble, - user_preamble, - rag_preamble, - relevant_emails, - ) - - print("Running with prompt:", prompt) - - args.model = "llama3.1" - stream = get_response_stream(prompt, args.model) - print_response_stream(stream) - - -if __name__ == "__main__": - main() diff --git a/src/panza/evaluation/ollama_service_inference.py b/src/panza/evaluation/ollama_service_inference.py deleted file mode 100644 index 2831536..0000000 --- a/src/panza/evaluation/ollama_service_inference.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -import sys -from typing import Annotated - -from fastapi import FastAPI, HTTPException, Header -from fastapi.responses import StreamingResponse -import uvicorn -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel -from dotenv import load_dotenv - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.evaluation import base_inference -from panza.utils import prompting -from panza.evaluation import ollama_inference - -class Request(BaseModel): - text: str - -sys.path.pop(0) - -app = FastAPI() - -origins = [ - "https://mail.google.com", -] - -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Load environment variables from the .env file -load_dotenv() -valid_api_keys = os.getenv("API_KEYS").split(",") - -parser = base_inference.get_base_inference_args_parser() -args = parser.parse_args() - -print("Running inference with args:", args) - -system_preamble, user_preamble, rag_preamble, _ = prompting.load_all_preambles( - args.system_preamble, args.user_preamble, args.rag_preamble, args.thread_preamble -) - -def predict(user_input): - relevant_emails = [] - prompt = prompting.create_prompt( - user_input, - system_preamble, - user_preamble, - rag_preamble, - relevant_emails, - ) - return ollama_inference.get_response_stream(prompt, args.model) - -def streamer(stream): - for chunk in stream: - yield chunk["message"]["content"] - -@app.options('/generate') -def options(): - return {"methods": ["POST"]} - -@app.post('/generate') -def generate_text(request: Request, x_api_key: Annotated[str | None, Header()] = None): - if x_api_key not in valid_api_keys: - raise HTTPException(status_code=401, detail="Invalid API key.") - stream = predict(request.text) - return StreamingResponse(streamer(stream), media_type='text/event-stream') - -if __name__ == '__main__': - uvicorn.run(app, host='0.0.0.0', port=5001) \ No newline at end of file diff --git a/src/panza/evaluation/service_inference.py b/src/panza/evaluation/service_inference.py deleted file mode 100644 index 9dc452d..0000000 --- a/src/panza/evaluation/service_inference.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -import sys -from typing import Annotated - -import torch - -from fastapi import FastAPI, HTTPException, Header -import uvicorn -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel -from dotenv import load_dotenv - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.evaluation import base_inference -from panza.utils import prompting, rag - -class Request(BaseModel): - text: str - -class Response(BaseModel): - 
generated_text: str - -sys.path.pop(0) - -app = FastAPI() - -origins = [ - "https://mail.google.com", -] - -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Load environment variables from the .env file -load_dotenv() -valid_api_keys = os.getenv("API_KEYS").split(",") - -parser = base_inference.get_base_inference_args_parser() -args = parser.parse_args() - -print("Running inference with args:", args) - -if args.nthreads is not None: - torch.set_num_threads(args.nthreads) - -print("Loading model ", args.model) -model, tokenizer = base_inference.load_model_and_tokenizer(args.model, args.device, args.dtype, load_in_4bit=args.load_in_4bit) - -if args.use_rag: - embeddings_model = rag.get_embeddings_model(args.embedding_model) - db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) - -#system_preamble, user_preamble, rag_preamble = prompting.load_all_preambles( -# args.system_preamble, args.user_preamble, args.rag_preamble -#) - -system_preamble, user_preamble, rag_preamble = ("", "", "") - -def predict(user_input): - prompts, outputs = base_inference.run_inference( - instructions=[user_input], - model=model, - tokenizer=tokenizer, - system_preamble=system_preamble, - user_preamble=user_preamble, - rag_preamble=rag_preamble, - rag_relevance_threshold=args.rag_relevance_threshold, - rag_num_emails=args.rag_num_emails, - use_rag=args.use_rag, - db=db if args.use_rag else None, - max_new_tokens=args.max_new_tokens, - best=args.best, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - device=args.device, - ) - - print("Processed input:", prompts[0]) - print("Generated email", outputs[0]) - - return outputs[0] - -@app.options('/generate') -def options(): - return {"methods": ["POST"]} - -@app.post('/generate') -def generate_text(request: Request, x_api_key: Annotated[str | None, Header()] = None): - if x_api_key not in valid_api_keys: - raise HTTPException(status_code=401, detail="Invalid API key, must be one of: " + str(valid_api_keys)) - generated_text = predict(request.text) - return {"generated_text": generated_text} - - -if __name__ == '__main__': - uvicorn.run(app, host='0.0.0.0', port=5000) diff --git a/src/panza3/finetuning/preprocessing.py b/src/panza/finetuning/preprocessing.py similarity index 97% rename from src/panza3/finetuning/preprocessing.py rename to src/panza/finetuning/preprocessing.py index 0f17aeb..26e499b 100644 --- a/src/panza3/finetuning/preprocessing.py +++ b/src/panza/finetuning/preprocessing.py @@ -5,7 +5,7 @@ from omegaconf import OmegaConf from transformers import AutoConfig, AutoTokenizer -from panza3.entities import EmailInstruction +from panza.entities import EmailInstruction PREPROCESSING_CONFIG_FILE = os.environ.get("PANZA_PREPROCESSING_CONFIG") if PREPROCESSING_CONFIG_FILE: diff --git a/src/panza3/finetuning/train.py b/src/panza/finetuning/train.py similarity index 99% rename from src/panza3/finetuning/train.py rename to src/panza/finetuning/train.py index c95a0e0..8c7b9e5 100644 --- a/src/panza3/finetuning/train.py +++ b/src/panza/finetuning/train.py @@ -68,7 +68,7 @@ import hydra from omegaconf import DictConfig, OmegaConf -from panza3 import PanzaWriter # The import also loads custom Hydra resolvers +from panza import PanzaWriter # The import also loads custom Hydra resolvers log = logging.getLogger(__name__) @@ -271,9 +271,9 @@ def build_composer_peft_model( bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", ) - 
elif weight_bias_dtype == 'bf16': - compute_dtype = torch.bfloat16 - quant_config = None + elif weight_bias_dtype == "bf16": + compute_dtype = torch.bfloat16 + quant_config = None else: assert weight_bias_dtype == "fp32" compute_dtype = torch.float32 diff --git a/src/panza3/interface/__init__.py b/src/panza/interface/__init__.py similarity index 100% rename from src/panza3/interface/__init__.py rename to src/panza/interface/__init__.py diff --git a/src/panza3/interface/cli.py b/src/panza/interface/cli.py similarity index 79% rename from src/panza3/interface/cli.py rename to src/panza/interface/cli.py index bcf03a3..43e5656 100644 --- a/src/panza3/interface/cli.py +++ b/src/panza/interface/cli.py @@ -1,5 +1,5 @@ -from panza3.entities.instruction import EmailInstruction, Instruction -from panza3.writer import PanzaWriter +from panza.entities.instruction import EmailInstruction, Instruction +from panza.writer import PanzaWriter class PanzaCLI: diff --git a/src/panza3/interface/gui.py b/src/panza/interface/gui.py similarity index 89% rename from src/panza3/interface/gui.py rename to src/panza/interface/gui.py index f48d97d..185c278 100644 --- a/src/panza3/interface/gui.py +++ b/src/panza/interface/gui.py @@ -1,5 +1,5 @@ -from panza3.entities.instruction import EmailInstruction, Instruction -from panza3.writer import PanzaWriter +from panza.entities.instruction import EmailInstruction, Instruction +from panza.writer import PanzaWriter import gradio as gr diff --git a/src/panza3/interface/gui_b.py b/src/panza/interface/gui_b.py similarity index 81% rename from src/panza3/interface/gui_b.py rename to src/panza/interface/gui_b.py index c42e019..62ab1db 100644 --- a/src/panza3/interface/gui_b.py +++ b/src/panza/interface/gui_b.py @@ -1,5 +1,5 @@ -from panza3.entities.instruction import EmailInstruction, Instruction -from panza3.writer import PanzaWriter +from panza.entities.instruction import EmailInstruction, Instruction +from panza.writer import PanzaWriter import gradio as gr @@ -22,10 +22,10 @@ def get_execute(self): def execute(input): instruction: Instruction = EmailInstruction(input) stream = self.writer.run(instruction, stream=False) - #output = "" - #for chunk in stream: + # output = "" + # for chunk in stream: # output += chunk - #yield stream.end() + # yield stream.end() yield stream return execute diff --git a/src/panza3/interface/json.py b/src/panza/interface/json.py similarity index 98% rename from src/panza3/interface/json.py rename to src/panza/interface/json.py index 3023ee6..f6d190a 100644 --- a/src/panza3/interface/json.py +++ b/src/panza/interface/json.py @@ -1,5 +1,5 @@ -from panza3.entities.instruction import EmailInstruction -from panza3.writer import PanzaWriter +from panza.entities.instruction import EmailInstruction +from panza.writer import PanzaWriter import json import numpy as np diff --git a/src/panza3/interface/web.py b/src/panza/interface/web.py similarity index 95% rename from src/panza3/interface/web.py rename to src/panza/interface/web.py index 59a5786..f925a70 100644 --- a/src/panza3/interface/web.py +++ b/src/panza/interface/web.py @@ -4,8 +4,8 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi import FastAPI, HTTPException, Header from fastapi.responses import StreamingResponse -from panza3.entities.instruction import EmailInstruction, Instruction -from panza3.writer import PanzaWriter +from panza.entities.instruction import EmailInstruction, Instruction +from panza.writer import PanzaWriter import uvicorn from pydantic import BaseModel from 
dotenv import load_dotenv diff --git a/src/panza3/llm/__init__.py b/src/panza/llm/__init__.py similarity index 100% rename from src/panza3/llm/__init__.py rename to src/panza/llm/__init__.py diff --git a/src/panza3/llm/base.py b/src/panza/llm/base.py similarity index 100% rename from src/panza3/llm/base.py rename to src/panza/llm/base.py diff --git a/src/panza3/llm/local.py b/src/panza/llm/local.py similarity index 100% rename from src/panza3/llm/local.py rename to src/panza/llm/local.py diff --git a/src/panza3/llm/ollama.py b/src/panza/llm/ollama.py similarity index 100% rename from src/panza3/llm/ollama.py rename to src/panza/llm/ollama.py diff --git a/src/panza3/prompting/__init__.py b/src/panza/prompting/__init__.py similarity index 100% rename from src/panza3/prompting/__init__.py rename to src/panza/prompting/__init__.py diff --git a/src/panza3/prompting/base.py b/src/panza/prompting/base.py similarity index 100% rename from src/panza3/prompting/base.py rename to src/panza/prompting/base.py diff --git a/src/panza3/prompting/email_prompting.py b/src/panza/prompting/email_prompting.py similarity index 100% rename from src/panza3/prompting/email_prompting.py rename to src/panza/prompting/email_prompting.py diff --git a/src/panza3/prompting/summarization_prompting.py b/src/panza/prompting/summarization_prompting.py similarity index 100% rename from src/panza3/prompting/summarization_prompting.py rename to src/panza/prompting/summarization_prompting.py diff --git a/src/panza3/prompting/utils.py b/src/panza/prompting/utils.py similarity index 100% rename from src/panza3/prompting/utils.py rename to src/panza/prompting/utils.py diff --git a/src/panza3/retriever/__init__.py b/src/panza/retriever/__init__.py similarity index 100% rename from src/panza3/retriever/__init__.py rename to src/panza/retriever/__init__.py diff --git a/src/panza3/retriever/base.py b/src/panza/retriever/base.py similarity index 100% rename from src/panza3/retriever/base.py rename to src/panza/retriever/base.py diff --git a/src/panza3/retriever/faiss.py b/src/panza/retriever/faiss.py similarity index 100% rename from src/panza3/retriever/faiss.py rename to src/panza/retriever/faiss.py diff --git a/src/panza3/retriever/none.py b/src/panza/retriever/none.py similarity index 100% rename from src/panza3/retriever/none.py rename to src/panza/retriever/none.py diff --git a/src/panza/utils/prompting.py b/src/panza/utils/prompting.py index 38f4fbe..ed3ec9f 100644 --- a/src/panza/utils/prompting.py +++ b/src/panza/utils/prompting.py @@ -18,6 +18,136 @@ PHI3_RESPONSE_END_WRAPPER = "<|end|>" +def create_prompt( + user_input: Text, + system_preamble: Text, + user_preamble: Text, + rag_preamble: Optional[Text] = None, + relevant_emails: Optional[List[Email]] = None, + thread_preamble: Optional[Text] = None, + thread_emails: Optional[List[Text]] = None, +) -> Text: + + if relevant_emails: + assert rag_preamble, "RAG preamble format must be provided if similar emails are provided." + rag_prompt = _create_rag_preamble_from_emails(rag_preamble, relevant_emails).strip() + else: + rag_prompt = "" + + if thread_emails: + assert thread_preamble, "Thread preamble format must be provided if thread is provided." 
+ thread_prompt = _create_threading_preamble(thread_preamble, thread_emails).strip() + else: + thread_prompt = "" + + system_preamble = system_preamble.strip() + user_preamble = user_preamble.strip() + + prompt = "" + if system_preamble: + prompt += f"{system_preamble}\n\n" + if user_preamble: + prompt += f"{user_preamble}\n\n" + if rag_prompt: + prompt += f"{rag_prompt}\n\n" + if thread_prompt: + prompt += f"{thread_prompt}\n\n" + prompt += f"Instruction: {user_input}" + + return prompt + + +def _create_rag_preamble_from_emails(rag_preamble_format: Text, emails: List[Email]) -> Text: + rag_context = _create_rag_context_from_emails(emails) + return rag_preamble_format.format(rag_context=rag_context) + + +def _create_rag_context_from_emails(emails: List[Email]) -> Text: + """Creates a RAG context from a list of relevant e-mails. + + The e-mails are formatted as follows: + + SUBJECT: + E-MAIL CONTENT: + + + --- + + SUBJECT: + E-MAIL CONTENT: + + + --- + ... + """ + + rag_context = "" + for email in emails: + rag_context += ( + # f"SUBJECT: {email.metadata['subject']}\n" # TODO(armand): Handle subject metadata + f"E-MAIL CONTENT:\n{email.page_content}\n\n---\n\n" + ) + + return rag_context + + +def _create_threading_preamble(threading_preamble_format: Text, thread: List[Text]) -> Text: + threading_context = _create_threading_context(thread) + return threading_preamble_format.format(threading_context=threading_context) + + +def _create_threading_context(thread: List[Text]) -> Text: + """Creates a threading context from a list of relevant e-mails. + + The e-mails are formatted as follows: + + + + --- + + + + --- + ... + """ + + threading_context = "" + for email in thread: + threading_context += f"{email}\n\n---\n\n" + + return threading_context + + +def load_preamble(path): + with open(path, "r") as file: + return file.read().strip() + + +# The user preamble must be edited by the user in order to work as intended. +# Here, we perform additional checks to make sure that that happened; if not, +# We issue a warning to the user. +def load_user_preamble(path): + with open(path, "r") as file: + lines = [l for l in file.readlines() if not l.strip().startswith("#")] + print(lines) + preamble = "".join(lines) + if "CHANGE ME" in preamble: + print( + "*" * 66 + + "\n* WARNING: User prompt preamble not customized. 
*\n* Please edit the preamble at prompt_preambles/user_preamble.txt *\n" + + "*" * 66 + ) + return preamble + + +def load_all_preambles(system_preamble, user_preamble, rag_preamble, thread_preamble): + system_preamble = load_preamble(system_preamble) if system_preamble else "" + user_preamble = load_user_preamble(user_preamble) if user_preamble else "" + rag_preamble = load_preamble(rag_preamble) if rag_preamble else "" + thread_preamble = load_preamble(thread_preamble) if thread_preamble else "" + return system_preamble, user_preamble, rag_preamble, thread_preamble + + def get_model_special_tokens(model_name): model_name = model_name.lower() if "llama" in model_name: diff --git a/src/panza/utils/rag.py b/src/panza/utils/rag.py index b33e9ae..5653ed9 100644 --- a/src/panza/utils/rag.py +++ b/src/panza/utils/rag.py @@ -1,3 +1,37 @@ from typing import List +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import FAISS from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore + + +def get_embeddings_model(model_name) -> Embeddings: + embeddings_model = HuggingFaceEmbeddings( + model_name=model_name, + model_kwargs={"device": "cpu"}, + encode_kwargs={"normalize_embeddings": False}, + ) + return embeddings_model + + +def create_vector_db(docs: List[Document], embeddings_model: Embeddings) -> VectorStore: + db = FAISS.from_documents(docs, embeddings_model) + return db + + +def load_vector_db_from_disk( + folder_path: str, index_name: str, embeddings_model: Embeddings +) -> VectorStore: + try: + db = FAISS.load_local( + folder_path=folder_path, + embeddings=embeddings_model, + index_name=index_name, + allow_dangerous_deserialization=True, # Allows pickle deserialization + ) + print("Faiss index loaded ") + return db + except Exception as e: + print("Fiass index loading failed \n", e) diff --git a/src/panza3/writer.py b/src/panza/writer.py similarity index 100% rename from src/panza3/writer.py rename to src/panza/writer.py diff --git a/src/panza3/__init__.py b/src/panza3/__init__.py deleted file mode 100644 index 100281c..0000000 --- a/src/panza3/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from omegaconf import OmegaConf - -from .prompting.utils import load_preamble, load_user_preamble - -OmegaConf.register_new_resolver("load_preamble", load_preamble) -OmegaConf.register_new_resolver("load_user_preamble", load_user_preamble) - -from .writer import PanzaWriter - -__all__ = ["PanzaWriter"] diff --git a/src/panza3/data_preparation/extract_emails.py b/src/panza3/data_preparation/extract_emails.py deleted file mode 100644 index e9ead5e..0000000 --- a/src/panza3/data_preparation/extract_emails.py +++ /dev/null @@ -1,208 +0,0 @@ -import json -import mailbox -import re -from email.utils import parsedate_to_datetime -from email.message import Message -from mailbox import mboxMessage -from os import makedirs -from os.path import join, dirname - -import langdetect - -CLEAN_EMAILS = [] -DISCARDED_EMAILS = { - "non_english": [], - "forwarded": [], - "short": [], - "empty": [], - "cant_decode_utf8": [], -} - -SHORT_EMAIL_THRESHOLD = 10 # words - -FORWARDED_MESSAGE_TAG = "---------- Forwarded message ---------" - - -def extract_only_plain_text(msg_part): - if msg_part.get_content_type() == "text/plain": - body = msg_part.get_payload(decode=True) - plain_text = body.decode() # assuming the text is in UTF-8, handle other cases later - return plain_text - - -def 
skip_forwarded_messages(plain_text): - if FORWARDED_MESSAGE_TAG in plain_text: - DISCARDED_EMAILS["forwarded"].append(plain_text) - return "" - else: - return plain_text - - -def remove_date_time(email_body): - # Regular expression pattern to match lines starting with "On " and ending with "> wrote: " - # The pattern uses non-greedy matching (.*?) to find the shortest match that satisfies the condition - pattern = re.compile(r"(^On.*wrote.*)|(^Am.*schrieb.*)", re.MULTILINE | re.DOTALL) - - match = pattern.search(email_body) - if match: - return (email_body[: match.start()] + email_body[match.end() :]).strip() - else: - return email_body - - -def remove_lines_starting_with_gt(text): - lines = text.split("\n") - filtered_lines = [ - line for line in lines if not line.startswith(">") - ] # Filter out lines starting with "> " - return "\n".join(filtered_lines) - - -def count_words(s): - return len(s.split()) - - -def extract_by_quote_level(text): - # Split the text into lines - lines = text.split("\n") - - # Dictionary to store lines by quote level - grouped_lines = {} - - for line in lines: - # Count the number of '>' at the start of the line - quote_level = len(re.match(r"^>*", line).group()) - - # Remove leading '>' and spaces - clean_line = re.sub(r"^>*\s*", "", line) - - # Add the clean line to the appropriate group - if quote_level not in grouped_lines: - grouped_lines[quote_level] = [] - grouped_lines[quote_level].append(clean_line) - - return grouped_lines - - -def filter_message(msg): - try: - plain_text = extract_only_plain_text(msg) - except: - DISCARDED_EMAILS["cant_decode_utf8"].append(msg) - return None - - if plain_text is None: - return None - - plain_text = skip_forwarded_messages(plain_text) - email_with_thread = extract_by_quote_level(plain_text) - email_with_thread = ["\n".join(an_email).strip() for an_email in email_with_thread.values()] - - # remove "On ... wrote:" lines - email_with_thread = [remove_date_time(an_email) for an_email in email_with_thread] - - main_email = email_with_thread.pop(0) - email_with_thread.reverse() # chronological order - - # check length before detecting language - if count_words(main_email) < SHORT_EMAIL_THRESHOLD: - DISCARDED_EMAILS["short"].append(plain_text) - return None - try: - if langdetect.detect(main_email) != "en": - DISCARDED_EMAILS["non_english"].append(plain_text) - return None - except: - # failed to detect language - DISCARDED_EMAILS["non_english"].append(plain_text) - return None - - if main_email.isspace() or main_email == "": - DISCARDED_EMAILS["empty"].append(plain_text) - return None - - return (main_email.strip(), [an_email.strip() for an_email in email_with_thread]) - - -def extract_emails(mailbox_path, output_path, email_addresses, save_discarded_emails_path): - - MBOX_PATH = mailbox_path - EMAIL = email_addresses - - mbox = mailbox.mbox(MBOX_PATH) - n_emails = len(mbox) - for i, message in enumerate(mbox): - print(f"--> processing {i}/{n_emails} <--") - # Filter messages sent from your email address - if message["from"] and any(email in message["from"] for email in EMAIL): - if message["Date"]: - date = parsedate_to_datetime(message["Date"]).isoformat() - else: - print("Date was not found in the email. 
Skipping.") - continue - if message.is_multipart(): - for part in message.walk(): - filtered_msg = filter_message(part) - if filtered_msg is not None: - print(filtered_msg) - main_email, thread = filtered_msg - CLEAN_EMAILS.append( - { - "email": main_email, - "thread": thread, - "subject": message["Subject"], - "date": date, - } - ) - else: - filtered_msg = filter_message(message) - if filtered_msg is not None: - print(filtered_msg) - main_email, thread = filtered_msg - CLEAN_EMAILS.append( - { - "email": main_email, - "thread": thread, - "subject": message["Subject"], - "date": date, - } - ) - - print(f"\n---> [Cleaning stats] <---") - print(f"# clean emails = {len(CLEAN_EMAILS)}") - print( - f"# discarded emails:" - f"\n\t non_english = {len(DISCARDED_EMAILS['non_english'])}" - f"\n\t empty = {len(DISCARDED_EMAILS['empty'])}" - f"\n\t short (less than {SHORT_EMAIL_THRESHOLD} words)= {len(DISCARDED_EMAILS['short'])}" - f"\n\t forwarded = {len(DISCARDED_EMAILS['forwarded'])}" - f"\n\t cant_decode_utf8 = {len(DISCARDED_EMAILS['cant_decode_utf8'])}" - ) - - first_email = EMAIL[0] - username = first_email[: first_email.find("@")] - - makedirs(dirname(output_path), exist_ok=True) - - # Save clean emails - with open(join(output_path), "w", encoding="utf-8") as f: - for item in CLEAN_EMAILS: - json_record = json.dumps(item) - f.write(json_record + "\n") - - # Save discarded emails - if save_discarded_emails_path and save_discarded_emails_path != "": - print(f"\n---> Processing Discarded Emails <---") - makedirs(save_discarded_emails_path, exist_ok=True) - for k, v in DISCARDED_EMAILS.items(): - print(f"--> processing {k} emails <--") - output_path = join(save_discarded_emails_path, f"{username}_discarded_{k}.jsonl") - with open(output_path, "w", encoding="utf-8") as f: - discarded_emails = len(v) - for i, item in enumerate(v): - print("\n\n\n\n\===========================") - if type(item) is Message or type(item) is mboxMessage: - item = item.get_payload() - print(f"--> processing {i}/{discarded_emails} <--") - json_record = json.dumps(item) - f.write(json_record + "\n") diff --git a/src/panza3/data_preparation/prepare_raft_emails.py b/src/panza3/data_preparation/prepare_raft_emails.py deleted file mode 100644 index 429ac30..0000000 --- a/src/panza3/data_preparation/prepare_raft_emails.py +++ /dev/null @@ -1,92 +0,0 @@ -import argparse -import gc -import json -import os -import sys -import time -from typing import Dict, List, Text - -import torch -from tqdm import tqdm - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.utils import rag -from panza.utils.documents import Email - -sys.path.pop(0) - - -def retrieve_similar_emails(batch, db, num_emails): - emails = [] - for email in batch: - try: - relevant_emails = db._similarity_search_with_relevance_scores( - email["email"], k=num_emails - ) - except Exception as e: - print(f"Error in RAG search: {e}") - relevant_emails = [] - return relevant_emails - - relevant_emails = [ - {"serialized_email": r[0].metadata["serialized_email"], "score": r[1]} - for r in relevant_emails - if r[0].page_content not in email["email"] - ] - email["relevant_emails"] = relevant_emails - emails.append(email) - - return emails - - -def main(): - parser = argparse.ArgumentParser( - description="Get similar emails for Retrieval Augmented Fine Tuning (RAFT)" - ) - parser.add_argument("--path-to-emails", help="Path to the cleaned emails") - parser.add_argument( - "--embedding-model", type=str, 
default="sentence-transformers/all-mpnet-base-v2" - ) - parser.add_argument("--db-path", type=str, default=None) - parser.add_argument("--index-name", type=str, default=None) - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument("--rag-num-emails", type=int, default=7) - args = parser.parse_args() - - assert args.path_to_emails.endswith( - ".jsonl" - ), f"Expecting a .jsonl file, but given = {args.path_to_emails}" - - print(f"--> Reading emails from: {args.path_to_emails}") - - # Read emails - with open(args.path_to_emails, "r") as f: - lines = f.readlines() - json_lines = [json.loads(line.strip(",")) for line in lines] - print(f"--> # emails = {len(json_lines)}") - - embeddings_model = rag.get_embeddings_model(args.embedding_model) - db = rag.load_vector_db_from_disk(args.db_path, args.index_name, embeddings_model) - - path_for_outputs = args.path_to_emails.rsplit(".jsonl", 1)[0] + "_raft.jsonl" - num_processed_emails = 0 - start_time = time.time() - with open(path_for_outputs, "w") as f: - for i in tqdm(range(0, len(json_lines), args.batch_size)): - # TODO(armand): Fix this print for batched inference - print(f"--> Processing batch {i}/{len(json_lines)}") - batch = json_lines[i : i + args.batch_size] - emails = retrieve_similar_emails(batch, db, args.rag_num_emails) - num_processed_emails += len(emails) - - for item in emails: - f.write(json.dumps(item)) - f.write("\n") - - elapsed_time = time.time() - start_time - print(f"{elapsed_time:.2f} seconds to process {len(json_lines)} emails.") - - -if __name__ == "__main__": - main() diff --git a/src/panza3/data_preparation/split_data.py b/src/panza3/data_preparation/split_data.py deleted file mode 100644 index a487dcc..0000000 --- a/src/panza3/data_preparation/split_data.py +++ /dev/null @@ -1,43 +0,0 @@ -import argparse -import json -import random -from datetime import datetime -from os import makedirs -from os.path import join - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--data-path", type=str, default=None) - parser.add_argument("--output-data-dir", type=str, default=None) - parser.add_argument("--train-ratio", type=float, default=0.8) - parser.add_argument("--split-type", type=str, default="random") - parser.add_argument("--seed", type=int, default=42) - args = parser.parse_args() - - makedirs(args.output_data_dir, exist_ok=True) - - with open(args.data_path, "r") as f: - data = f.readlines() - - if args.split_type == "random": - random.seed(args.seed) - random.shuffle(data) - elif args.split_type == "chronological": - data = sorted(data, key=lambda x: datetime.fromisoformat(json.loads(x)["date"])) - else: - raise ValueError("Invalid split type.") - - train_size = int(len(data) * args.train_ratio) - - with open(join(args.output_data_dir, "train.jsonl"), "w") as f: - for i in range(train_size): - f.write(data[i]) - - with open(join(args.output_data_dir, "test.jsonl"), "w") as f: - for i in range(train_size, len(data)): - f.write(data[i]) - - -if __name__ == "__main__": - main() diff --git a/src/panza3/data_preparation/summarize_emails.py b/src/panza3/data_preparation/summarize_emails.py deleted file mode 100644 index ec92270..0000000 --- a/src/panza3/data_preparation/summarize_emails.py +++ /dev/null @@ -1,202 +0,0 @@ -import argparse -import gc -import json -import os -import sys -import time -from typing import Dict, List, Text - -import torch -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig - -sys.path.insert(0, 
os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) - -from panza.utils import prompting - -sys.path.pop(0) - -MDL = os.environ.get("PANZA_GENERATIVE_MODEL") -TEMP = 0.7 -TOP_P = 0.7 -TOP_K = 50 - - -class LLMSummarizer: - def __init__(self, model, dtype, temperature, top_k, top_p, summarization_prompt, load_in_4bit) -> None: - self.device = "cuda" - - if load_in_4bit: - quant_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=dtype, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type='nf4', - ) - else: - quant_config = None - - self.model = AutoModelForCausalLM.from_pretrained( - model, torch_dtype=dtype, device_map=self.device, quantization_config=quant_config, trust_remote_code=True - ) - self.tokenizer = AutoTokenizer.from_pretrained( - model, model_max_length=self.model.config.max_position_embeddings, trust_remote_code=True - ) - self.tokenizer.padding_side = "left" - self.tokenizer.pad_token = self.tokenizer.eos_token - self.summarization_prompt = summarization_prompt - - _, self.prompt_end_wrapper, _, self.response_end_wrapper = ( - prompting.get_model_special_tokens(self.model.name_or_path) - ) - - # Save sampling parameters - self.temperature = temperature - self.top_k = top_k - self.top_p = top_p - - def prepare_batch_for_inference(self, emails: List[Dict]) -> List[Text]: - batch_with_prompt = [] - for item in emails: - prompt_with_email = self.summarization_prompt.format(email=item["email"]) - batch_with_prompt.append([{"role": "user", "content": prompt_with_email}]) - return batch_with_prompt - - def run_inference(self, emails: List[Dict]) -> List[Dict]: - gc.collect() - torch.cuda.empty_cache() - batch = self.prepare_batch_for_inference(emails) - - model_inputs = self.tokenizer.apply_chat_template( - batch, - return_tensors="pt", - add_generation_prompt=True, - padding=True, - truncation=True, - return_dict=True, - ) - model_inputs = model_inputs.to(self.device) - - generated_ids = self.model.generate( - **model_inputs, - max_new_tokens=1024, - do_sample=True, - temperature=self.temperature, - top_k=self.top_k, - top_p=self.top_p, - pad_token_id=self.tokenizer.pad_token_id, - ) - - outputs = self.tokenizer.batch_decode(generated_ids) - - # Extract generated text - summaries = [] - for output in outputs: - output = output.split(self.prompt_end_wrapper)[-1] - output = output.split(self.response_end_wrapper)[0] - output = output.strip() - summaries.append(output) - - return summaries - - -def generate_synthetic_instructions(emails: List[Dict], summarizer: LLMSummarizer): - summarized_emails = [] - - summaries = summarizer.run_inference(emails) - - for j, generated_text in enumerate(summaries): - - # Check if the outputs are valid - keyword = "Instruction: " - if generated_text.count(keyword) != 1: - print( - f"[WARNING] Skipping this sample:\n{generated_text}\n-----> " - f"[REASON] it contains none or multiple instances of the keyword = {keyword}, " - "but we expect exactly one" - ) - continue - - instruction = generated_text.split(keyword, 1)[1] - summarized_emails.append( - { - "email": emails[j]["email"], - "subject": emails[j]["subject"], - "summary": instruction, - "thread": emails[j]["thread"], - "date": emails[j]["date"], - } - ) - - return summarized_emails - - -def main(): - parser = argparse.ArgumentParser( - description="Transform emails into dataset for PANZA finetuning" - ) - parser.add_argument("--path-to-emails", help="Path to the cleaned emails") - parser.add_argument("--prompt-file", help="A path to file with prompt text") - 
parser.add_argument("--batch-size", type=int, help="Inference batch size") - parser.add_argument("--load-in-4bit", default=False, action='store_true', help="Wheather to load the model in 4bit precision (BNB)") - parser.add_argument("--fp32", default=False, action='store_true', help="Whether to use FP32 precision for computation") - args = parser.parse_args() - - assert args.path_to_emails.endswith( - ".jsonl" - ), f"Expecting a .jsonl file, but given = {args.path_to_emails}" - - assert os.path.exists( - args.prompt_file - ), f"Prompt file does not exist. Given path = {args.prompt_file}" - with open(args.prompt_file, "r") as file: - summarization_prompt = file.read() - - print(f"--> Reading emails from: {args.path_to_emails}") - print(f"--> Processing with batch_size {args.batch_size} and prompt = {summarization_prompt}") - print( - f"--> params for sampling:" - f"\t model = {MDL}" - f"\t temperature = {TEMP}" - f"\t top_p = {TOP_P}" - ) - - # Read emails - with open(args.path_to_emails, "r") as f: - lines = f.readlines() - json_lines = [json.loads(line.strip(',')) for line in lines] - print(f"--> # emails = {len(json_lines)}") - - summarizer = LLMSummarizer( - model=MDL, - dtype=torch.float32 if args.fp32 else torch.bfloat16, - temperature=TEMP, - top_p=TOP_P, - top_k=TOP_K, - summarization_prompt=summarization_prompt, - load_in_4bit=args.load_in_4bit - ) - - # Generate synthetic instructions - path_for_outputs = args.path_to_emails.rsplit(".jsonl", 1)[0] + "_summarized.jsonl" - num_processed_emails = 0 - start_time = time.time() - with open(path_for_outputs, "w") as f: - for i in tqdm(range(0, len(json_lines), args.batch_size)): - # TODO(armand): Fix this print for batched inference - print(f"--> Processing batch {i}/{len(json_lines)}") - batch = json_lines[i : i + args.batch_size] - summarized_emails = generate_synthetic_instructions(batch, summarizer) - num_processed_emails += len(summarized_emails) - - # Write the summarized emails to a file - for item in summarized_emails: - f.write(json.dumps(item)) - f.write("\n") - - elapsed_time = time.time() - start_time - print(f"{elapsed_time:.2f} seconds to process {len(json_lines)} emails.") - - -if __name__ == "__main__": - main() diff --git a/src/panza3/utils/documents.py b/src/panza3/utils/documents.py deleted file mode 100644 index ccd7f85..0000000 --- a/src/panza3/utils/documents.py +++ /dev/null @@ -1,46 +0,0 @@ -import copy -import json -from abc import ABC, abstractmethod -from dataclasses import asdict, dataclass -from datetime import datetime -from typing import Dict, List, Optional, Union - - -@dataclass -class Document(ABC): - summary: Optional[str] = None - - @abstractmethod - def serialize(self) -> dict: - """Convert the document to a dictionary that can be serialized to JSON.""" - pass - - @classmethod - @abstractmethod - def deserialize(cls, data: Union[str, Dict]) -> "Document": - """Convert a serialized document into a Document object.""" - pass - - -@dataclass(kw_only=True) -class Email(Document): - email: str - subject: str - thread: List[str] - date: datetime - - def serialize(self) -> dict: - dictionary = asdict(self) - dictionary["date"] = self.date.isoformat() - return dictionary - - @classmethod - def deserialize(cls, data: Union[str, Dict]) -> "Email": - if isinstance(data, str): - dictionary = json.loads(data) - elif isinstance(data, dict): - dictionary = copy.deepcopy(data) - else: - raise ValueError(f"Cannot deserialize data of type {type(data)}. 
Must be str or dict.") - dictionary["date"] = datetime.fromisoformat(dictionary["date"]) - return cls(**dictionary) diff --git a/src/panza3/utils/prompting.py b/src/panza3/utils/prompting.py deleted file mode 100644 index 87e18d8..0000000 --- a/src/panza3/utils/prompting.py +++ /dev/null @@ -1,175 +0,0 @@ -from typing import List, Optional, Text - -from panza.utils.documents import Email - -MISTRAL_PROMPT_START_WRAPPER = "[INST] " -MISTRAL_PROMPT_END_WRAPPER = " [/INST]" -MISTRAL_RESPONSE_START_WRAPPER = "" -MISTRAL_RESPONSE_END_WRAPPER = "" - -LLAMA3_PROMPT_START_WRAPPER = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" -LLAMA3_PROMPT_END_WRAPPER = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" -LLAMA3_RESPONSE_START_WRAPPER = "" -LLAMA3_RESPONSE_END_WRAPPER = "<|eot_id|>" - -PHI3_PROMPT_START_WRAPPER = "<|user|> " -PHI3_PROMPT_END_WRAPPER = "<|end|><|assistant|> " -PHI3_RESPONSE_START_WRAPPER = "" -PHI3_RESPONSE_END_WRAPPER = "<|end|>" - - -def create_prompt( - user_input: Text, - system_preamble: Text, - user_preamble: Text, - rag_preamble: Optional[Text] = None, - relevant_emails: Optional[List[Email]] = None, - thread_preamble: Optional[Text] = None, - thread_emails: Optional[List[Text]] = None, -) -> Text: - - if relevant_emails: - assert rag_preamble, "RAG preamble format must be provided if similar emails are provided." - rag_prompt = _create_rag_preamble_from_emails(rag_preamble, relevant_emails).strip() - else: - rag_prompt = "" - - if thread_emails: - assert thread_preamble, "Thread preamble format must be provided if thread is provided." - thread_prompt = _create_threading_preamble( - thread_preamble, thread_emails - ).strip() - else: - thread_prompt = "" - - system_preamble = system_preamble.strip() - user_preamble = user_preamble.strip() - - prompt = "" - if system_preamble: - prompt += f"{system_preamble}\n\n" - if user_preamble: - prompt += f"{user_preamble}\n\n" - if rag_prompt: - prompt += f"{rag_prompt}\n\n" - if thread_prompt: - prompt += f"{thread_prompt}\n\n" - prompt += f"Instruction: {user_input}" - - return prompt - - -def _create_rag_preamble_from_emails(rag_preamble_format: Text, emails: List[Email]) -> Text: - rag_context = _create_rag_context_from_emails(emails) - return rag_preamble_format.format(rag_context=rag_context) - - -def _create_rag_context_from_emails(emails: List[Email]) -> Text: - """Creates a RAG context from a list of relevant e-mails. - - The e-mails are formatted as follows: - - SUBJECT: - E-MAIL CONTENT: - - - --- - - SUBJECT: - E-MAIL CONTENT: - - - --- - ... - """ - - rag_context = "" - for email in emails: - rag_context += ( - # f"SUBJECT: {email.metadata['subject']}\n" # TODO(armand): Handle subject metadata - f"E-MAIL CONTENT:\n{email.page_content}\n\n---\n\n" - ) - - return rag_context - - -def _create_threading_preamble( - threading_preamble_format: Text, thread: List[Text] -) -> Text: - threading_context = _create_threading_context(thread) - return threading_preamble_format.format(threading_context=threading_context) - - -def _create_threading_context(thread: List[Text]) -> Text: - """Creates a threading context from a list of relevant e-mails. - - The e-mails are formatted as follows: - - - - --- - - - - --- - ... 
- """ - - threading_context = "" - for email in thread: - threading_context += f"{email}\n\n---\n\n" - - return threading_context - - -def load_preamble(path): - with open(path, "r") as file: - return file.read().strip() - - -# The user preamble must be edited by the user in order to work as intended. -# Here, we perform additional checks to make sure that that happened; if not, -# We issue a warning to the user. -def load_user_preamble(path): - with open(path, "r") as file: - lines = [l for l in file.readlines() if not l.strip().startswith("#")] - print(lines) - preamble = "".join(lines) - if "CHANGE ME" in preamble: - print( - "*" * 66 - + "\n* WARNING: User prompt preamble not customized. *\n* Please edit the preamble at prompt_preambles/user_preamble.txt *\n" - + "*" * 66 - ) - return preamble - - -def load_all_preambles(system_preamble, user_preamble, rag_preamble, thread_preamble): - system_preamble = load_preamble(system_preamble) if system_preamble else "" - user_preamble = load_user_preamble(user_preamble) if user_preamble else "" - rag_preamble = load_preamble(rag_preamble) if rag_preamble else "" - thread_preamble = load_preamble(thread_preamble) if thread_preamble else "" - return system_preamble, user_preamble, rag_preamble, thread_preamble - - -def get_model_special_tokens(model_name): - model_name = model_name.lower() - if "llama" in model_name: - prompt_start_wrapper = LLAMA3_PROMPT_START_WRAPPER - prompt_end_wrapper = LLAMA3_PROMPT_END_WRAPPER - response_start_wrapper = LLAMA3_RESPONSE_START_WRAPPER - response_end_wrapper = LLAMA3_RESPONSE_END_WRAPPER - elif "mistral" in model_name.lower(): - prompt_start_wrapper = MISTRAL_PROMPT_START_WRAPPER - prompt_end_wrapper = MISTRAL_PROMPT_END_WRAPPER - response_start_wrapper = MISTRAL_RESPONSE_START_WRAPPER - response_end_wrapper = MISTRAL_RESPONSE_END_WRAPPER - elif "phi" in model_name.lower(): - prompt_start_wrapper = PHI3_PROMPT_START_WRAPPER - prompt_end_wrapper = PHI3_PROMPT_END_WRAPPER - response_start_wrapper = PHI3_RESPONSE_START_WRAPPER - response_end_wrapper = PHI3_RESPONSE_END_WRAPPER - else: - raise ValueError(f"Presets missing for prompting model {model_name}") - - return prompt_start_wrapper, prompt_end_wrapper, response_start_wrapper, response_end_wrapper diff --git a/src/panza3/utils/rag.py b/src/panza3/utils/rag.py deleted file mode 100644 index 5653ed9..0000000 --- a/src/panza3/utils/rag.py +++ /dev/null @@ -1,37 +0,0 @@ -from typing import List - -from langchain_community.embeddings import HuggingFaceEmbeddings -from langchain_community.vectorstores import FAISS -from langchain_core.documents import Document -from langchain_core.embeddings import Embeddings -from langchain_core.vectorstores import VectorStore - - -def get_embeddings_model(model_name) -> Embeddings: - embeddings_model = HuggingFaceEmbeddings( - model_name=model_name, - model_kwargs={"device": "cpu"}, - encode_kwargs={"normalize_embeddings": False}, - ) - return embeddings_model - - -def create_vector_db(docs: List[Document], embeddings_model: Embeddings) -> VectorStore: - db = FAISS.from_documents(docs, embeddings_model) - return db - - -def load_vector_db_from_disk( - folder_path: str, index_name: str, embeddings_model: Embeddings -) -> VectorStore: - try: - db = FAISS.load_local( - folder_path=folder_path, - embeddings=embeddings_model, - index_name=index_name, - allow_dangerous_deserialization=True, # Allows pickle deserialization - ) - print("Faiss index loaded ") - return db - except Exception as e: - print("Fiass index loading failed \n", 
e) diff --git a/tests/conftest.py b/tests/conftest.py index 11d13ed..1531ba2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,22 +3,25 @@ import pytest -from panza3.entities import Email -from panza3.retriever import FaissRetriever +from panza.entities import Email +from panza.retriever import FaissRetriever @pytest.fixture def embedding_model() -> str: return "sentence-transformers/all-mpnet-base-v2" + @pytest.fixture def generative_model() -> str: return "microsoft/Phi-3-mini-4k-instruct" + @pytest.fixture def peft_model() -> str: return "microsoft/Phi-3-mini-4k-instruct" + @pytest.fixture def index_name() -> str: return "test-index" diff --git a/tests/test_entities.py b/tests/test_entities.py index 90cb28d..18935c1 100644 --- a/tests/test_entities.py +++ b/tests/test_entities.py @@ -3,7 +3,7 @@ import pytest -from panza3.entities import Email, EmailInstruction +from panza.entities import Email, EmailInstruction def test_email_serialization_deserialization(): diff --git a/tests/test_local_llm.py b/tests/test_local_llm.py index 3ba5ab9..3d21b16 100644 --- a/tests/test_local_llm.py +++ b/tests/test_local_llm.py @@ -3,8 +3,8 @@ import pytest from torch import float32 as torch_float32 -from panza3.llm import PeftLLM, TransformersLLM -from panza3.llm.local import _MISSING_LIBRARIES, LocalLLM +from panza.llm import PeftLLM, TransformersLLM +from panza.llm.local import _MISSING_LIBRARIES, LocalLLM skip_if_no_transformers = pytest.mark.skipif( "transformers" in _MISSING_LIBRARIES, reason="transformers is not installed" diff --git a/tests/test_ollama_llm.py b/tests/test_ollama_llm.py index 41d1a4c..3a5f406 100644 --- a/tests/test_ollama_llm.py +++ b/tests/test_ollama_llm.py @@ -1,8 +1,8 @@ from typing import Dict, Type from unittest.mock import patch, MagicMock -from panza3.llm.ollama import OllamaLLM -from panza3.llm import MessageType +from panza.llm.ollama import OllamaLLM +from panza.llm import MessageType import pytest MODEL: str = "test_model" diff --git a/tests/test_prompting.py b/tests/test_prompting.py index 48e1e2e..3e495a6 100644 --- a/tests/test_prompting.py +++ b/tests/test_prompting.py @@ -4,9 +4,9 @@ import pytest -from panza3.entities import Email, EmailInstruction -from panza3.prompting import EmailPromptBuilder -from panza3.retriever import FaissRetriever +from panza.entities import Email, EmailInstruction +from panza.prompting import EmailPromptBuilder +from panza.retriever import FaissRetriever def test_email_prompt_builder( @@ -29,13 +29,16 @@ def test_email_prompt_builder( instruction="Write an email.", thread=["email1", "email2", "email3"] ) - system_preamble, user_preamble, rag_preamble, thread_preamble = ( - EmailPromptBuilder.load_all_preambles( - system_preamble_path=system_preamble_path, - user_preamble_path=user_preamble_path, - rag_preamble_path=rag_preamble_path, - thread_preamble_path=thread_preamble_path, - ) + ( + system_preamble, + user_preamble, + rag_preamble, + thread_preamble, + ) = EmailPromptBuilder.load_all_preambles( + system_preamble_path=system_preamble_path, + user_preamble_path=user_preamble_path, + rag_preamble_path=rag_preamble_path, + thread_preamble_path=thread_preamble_path, ) prompt_builder = EmailPromptBuilder( diff --git a/tests/test_retriever.py b/tests/test_retriever.py index 347df94..af0ec74 100644 --- a/tests/test_retriever.py +++ b/tests/test_retriever.py @@ -3,8 +3,8 @@ import pytest -from panza3.entities import Email -from panza3.retriever import FaissRetriever +from panza.entities import Email +from panza.retriever import 
FaissRetriever def get_faiss_retriever( diff --git a/tests/test_writer.py b/tests/test_writer.py index 48e9f5d..e8e6100 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -2,10 +2,10 @@ import pytest -from panza3.entities import EmailInstruction -from panza3.llm import LLM -from panza3.prompting import EmailPromptBuilder -from panza3.writer import PanzaWriter +from panza.entities import EmailInstruction +from panza.llm import LLM +from panza.prompting import EmailPromptBuilder +from panza.writer import PanzaWriter def test_email_writer(): From be76c39c47b3741be8b2ab3ee5892febaffa75c0 Mon Sep 17 00:00:00 2001 From: Andrej Jovanovic Date: Thu, 14 Nov 2024 11:17:22 +0100 Subject: [PATCH 109/112] Clear ollama and web use-case --- configs/interfaces/web.yaml | 2 - configs/writer/llm/ollama.yaml | 6 -- pyproject.toml | 1 - scripts/run_ollama_services.sh | 36 ----------- scripts/run_panza.sh | 29 --------- scripts/run_panza_gui.sh | 31 --------- scripts/run_panza_ollama.sh | 31 --------- scripts/run_services.sh | 37 ----------- src/panza/interface/__init__.py | 3 +- src/panza/interface/json.py | 7 +- src/panza/interface/web.py | 79 ----------------------- src/panza/llm/__init__.py | 3 +- src/panza/llm/ollama.py | 93 --------------------------- tests/test_ollama_llm.py | 72 --------------------- tests/test_prompting.py | 110 -------------------------------- 15 files changed, 7 insertions(+), 533 deletions(-) delete mode 100644 configs/interfaces/web.yaml delete mode 100644 configs/writer/llm/ollama.yaml delete mode 100755 scripts/run_ollama_services.sh delete mode 100755 scripts/run_panza.sh delete mode 100755 scripts/run_panza_gui.sh delete mode 100755 scripts/run_panza_ollama.sh delete mode 100755 scripts/run_services.sh delete mode 100644 src/panza/interface/web.py delete mode 100644 src/panza/llm/ollama.py delete mode 100644 tests/test_ollama_llm.py delete mode 100644 tests/test_prompting.py diff --git a/configs/interfaces/web.yaml b/configs/interfaces/web.yaml deleted file mode 100644 index 59cfce9..0000000 --- a/configs/interfaces/web.yaml +++ /dev/null @@ -1,2 +0,0 @@ -port: 5001 -_target_: panza.interface.PanzaWebService \ No newline at end of file diff --git a/configs/writer/llm/ollama.yaml b/configs/writer/llm/ollama.yaml deleted file mode 100644 index e384a6b..0000000 --- a/configs/writer/llm/ollama.yaml +++ /dev/null @@ -1,6 +0,0 @@ -defaults: - - sampling: random - -_target_: panza.llm.OllamaLLM -name: "custom" -gguf_file: "custom.gguf" diff --git a/pyproject.toml b/pyproject.toml index e7d70a4..9ac7878 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,6 @@ version = "2024.08.14" description = "A personal email assistant, trained and running on-device." 
dependencies = [ "torch==2.2.2", - "ollama", "omegaconf", "fastapi", "uvicorn", diff --git a/scripts/run_ollama_services.sh b/scripts/run_ollama_services.sh deleted file mode 100755 index 61857be..0000000 --- a/scripts/run_ollama_services.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -source config.sh - -MODEL="custom" - -DEVICE="cuda:1" -DTYPE="bf16" - -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - -USE_RAG=$([ "${PANZA_DISABLE_RAG_INFERENCE}" = "1" ] && echo "" || echo "--use-rag") -USE_4BIT_QUANT=$([ "${MODEL_PRECISION}" = "4bit" ] && echo "--load-in-4bit" || echo "") - -INFERENCE_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/ollama_service_inference.py -python ${INFERENCE_SCRIPT} \ - --model=${MODEL} \ - --device=${DEVICE} \ - --dtype=${DTYPE} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --embedding-model=${PANZA_EMBEDDING_MODEL} \ - --db-path=${PANZA_DATA_DIR} \ - --index-name=${PANZA_USERNAME} \ - --rag-relevance-threshold=${PANZA_RAG_RELEVANCE_THRESHOLD} \ - ${USE_RAG} \ - ${USE_4BIT_QUANT} \ No newline at end of file diff --git a/scripts/run_panza.sh b/scripts/run_panza.sh deleted file mode 100755 index 941515a..0000000 --- a/scripts/run_panza.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -source config.sh - -MODEL=${PANZA_GENERATIVE_MODEL} # Replace this with the checkpoint you want to use! - -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - -USE_RAG=$([ "${PANZA_DISABLE_RAG_INFERENCE}" = "1" ] && echo "" || echo "--use-rag") - -INFERENCE_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/gui_inference.py -python ${INFERENCE_SCRIPT} \ - --model=${MODEL} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --embedding-model=${PANZA_EMBEDDING_MODEL} \ - --db-path=${PANZA_DATA_DIR} \ - --index-name=${PANZA_USERNAME} \ - --rag-relevance-threshold=${PANZA_RAG_RELEVANCE_THRESHOLD} \ - ${USE_RAG} diff --git a/scripts/run_panza_gui.sh b/scripts/run_panza_gui.sh deleted file mode 100755 index 911c32c..0000000 --- a/scripts/run_panza_gui.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -source config.sh - -MODEL=${PANZA_GENERATIVE_MODEL} # Replace this with the checkpoint you want to use! 
- -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - -USE_RAG=$([ "${PANZA_DISABLE_RAG_INFERENCE}" = "1" ] && echo "" || echo "--use-rag") -USE_4BIT_QUANT=$([ "${MODEL_PRECISION}" = "4bit" ] && echo "--load-in-4bit" || echo "") - -INFERENCE_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/gui_inference.py -python ${INFERENCE_SCRIPT} \ - --model=${MODEL} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --embedding-model=${PANZA_EMBEDDING_MODEL} \ - --db-path=${PANZA_DATA_DIR} \ - --index-name=${PANZA_USERNAME} \ - --rag-relevance-threshold=${PANZA_RAG_RELEVANCE_THRESHOLD} \ - ${USE_RAG} \ - ${USE_4BIT_QUANT} diff --git a/scripts/run_panza_ollama.sh b/scripts/run_panza_ollama.sh deleted file mode 100755 index d63dce8..0000000 --- a/scripts/run_panza_ollama.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -source config.sh - -MODEL=${PANZA_GENERATIVE_MODEL} # Replace this with the checkpoint you want to use! - -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - -USE_RAG=$([ "${PANZA_DISABLE_RAG_INFERENCE}" = "1" ] && echo "" || echo "--use-rag") -USE_4BIT_QUANT=$([ "${MODEL_PRECISION}" = "4bit" ] && echo "--load-in-4bit" || echo "") - -INFERENCE_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/ollama_inference.py -python ${INFERENCE_SCRIPT} \ - --model=llama3.1 \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --embedding-model=${PANZA_EMBEDDING_MODEL} \ - --db-path=${PANZA_DATA_DIR} \ - --index-name=${PANZA_USERNAME} \ - --rag-relevance-threshold=${PANZA_RAG_RELEVANCE_THRESHOLD} \ - ${USE_RAG} \ - ${USE_4BIT_QUANT} diff --git a/scripts/run_services.sh b/scripts/run_services.sh deleted file mode 100755 index d054232..0000000 --- a/scripts/run_services.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -source config.sh - -MODEL="../checkpoints/models/panza_seanyang711_llama3_bf16-bs8-rosa_wl16_d0.01_1grads_mean_squared_r8_loralr1e-5_alpha16-lr1e-5-epochs5-wu8-seed42-PREAMBLE-16296" -MODEL="../checkpoints/models/panza_llama3_bf16-bs8-rosa_wl16_d0_1grads_mean_squared_r8_loralr1e-5_alpha16-lr1e-5-epochs5-wu8-seed42-PREAMBLE-31921" -MODEL="../checkpoints/models/panza_jen.iofinova-Meta-Llama-3-8B-Instruct-bf16-bs8-fft-lr1e-05-3ep-seed41" - -DEVICE="cuda:1" -DTYPE="auto" - -for ARGUMENT in "$@" -do - KEY=$(echo $ARGUMENT | cut -f1 -d=) - - KEY_LENGTH=${#KEY} - VALUE="${ARGUMENT:$KEY_LENGTH+1}" - - export "$KEY"="$VALUE" -done - -USE_RAG=$([ "${PANZA_DISABLE_RAG_INFERENCE}" = "1" ] && echo "" || echo "--use-rag") -USE_4BIT_QUANT=$([ "${MODEL_PRECISION}" = "4bit" ] && echo "--load-in-4bit" || echo "") - -INFERENCE_SCRIPT=${PANZA_WORKSPACE}/src/panza/evaluation/service_inference.py -python ${INFERENCE_SCRIPT} \ - --model=${MODEL} \ - --device=${DEVICE} \ - --dtype=${DTYPE} \ - --system-preamble=${PANZA_SYSTEM_PREAMBLE_PATH} \ - --user-preamble=${PANZA_USER_PREAMBLE_PATH} \ - --rag-preamble=${PANZA_RAG_PREAMBLE_PATH} \ - --embedding-model=${PANZA_EMBEDDING_MODEL} \ - --db-path=${PANZA_DATA_DIR} \ - --index-name=${PANZA_USERNAME} \ - --rag-relevance-threshold=${PANZA_RAG_RELEVANCE_THRESHOLD} \ - ${USE_4BIT_QUANT} diff --git a/src/panza/interface/__init__.py b/src/panza/interface/__init__.py index 
6d25a4b..3af9505 100644 --- a/src/panza/interface/__init__.py +++ b/src/panza/interface/__init__.py @@ -1,6 +1,5 @@ -from .web import PanzaWebService from .cli import PanzaCLI from .gui import PanzaGUI from .json import PanzaJSON -__all__ = ["PanzaWebService", "PanzaCLI", "PanzaGUI", "PanzaJSON"] +__all__ = ["PanzaCLI", "PanzaGUI", "PanzaJSON"] diff --git a/src/panza/interface/json.py b/src/panza/interface/json.py index f6d190a..4169c12 100644 --- a/src/panza/interface/json.py +++ b/src/panza/interface/json.py @@ -126,10 +126,13 @@ def assemble_responses(self, prompts_json, batch_size, use_thread, responses_per if use_thread: instructions = list(zip(prompts, threads)) else: - instructions = list(zip(prompts, [None] * len(prompts))) + instructions = list(zip(prompts, [[]] * len(prompts))) outputs, full_prompts = self.writer.run_batch( - [EmailInstruction(user_input) for user_input in instructions], + [ + EmailInstruction(user_input[0], thread=user_input[1]) + for user_input in instructions + ], return_prompt=True, ) diff --git a/src/panza/interface/web.py b/src/panza/interface/web.py deleted file mode 100644 index f925a70..0000000 --- a/src/panza/interface/web.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -from typing import Annotated, Generator, List -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from fastapi import FastAPI, HTTPException, Header -from fastapi.responses import StreamingResponse -from panza.entities.instruction import EmailInstruction, Instruction -from panza.writer import PanzaWriter -import uvicorn -from pydantic import BaseModel -from dotenv import load_dotenv -import threading - - -class Request(BaseModel): - text: str - - -class PanzaWebService: - def __init__(self, writer: PanzaWriter, port: int): - self.app = FastAPI() - self.writer = writer - self.port = port - self._setup_routes() - load_dotenv() - self._add_cors() - self.api_keys = self._get_valid_api_keys() - self._start_server() - self.server_thread = None - - def _add_cors(self): - self.app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - ) - - def _get_valid_api_keys(self) -> List[str]: - return os.getenv("API_KEYS").split(",") - - def _streamer(self, stream): - for chunk in stream: - yield chunk - - def _predict(self, input: str) -> Generator: - instruction: Instruction = EmailInstruction(input) - stream: Generator = self.writer.run(instruction, stream=True) - return stream - - def _setup_routes(self): - @self.app.options("/generate") - def options(): - return {"methods": ["POST"]} - - @self.app.post("/generate") - def generate_text(request: Request, x_api_key: Annotated[str | None, Header()]): - if x_api_key not in self.api_keys: - raise HTTPException(status_code=401, detail="Invalid API key.") - stream = self._predict(request.text) - return StreamingResponse(self._streamer(stream), media_type="text/event-stream") - - def _start_server(self): - self.server_thread = threading.Thread( - target=uvicorn.run, - args=(self.app,), - kwargs={"port": self.port}, - daemon=False, - ) - self.server_thread.start() - print("Panza web server started.") - - def _stop_server(self): - if self.server_thread is None: - return - self.server_thread.join() - self.server_thread = None - print("Panza web server stopped.") diff --git a/src/panza/llm/__init__.py b/src/panza/llm/__init__.py index acca5fd..477919b 100644 --- a/src/panza/llm/__init__.py +++ b/src/panza/llm/__init__.py @@ -1,5 +1,4 @@ from .base import LLM, 
ChatHistoryType, MessageType from .local import PeftLLM, TransformersLLM -from .ollama import OllamaLLM -__all__ = ["LLM", "ChatHistoryType", "MessageType", "OllamaLLM", "TransformersLLM", "PeftLLM"] +__all__ = ["LLM", "ChatHistoryType", "MessageType", "TransformersLLM", "PeftLLM"] diff --git a/src/panza/llm/ollama.py b/src/panza/llm/ollama.py deleted file mode 100644 index e6f7223..0000000 --- a/src/panza/llm/ollama.py +++ /dev/null @@ -1,93 +0,0 @@ -import os -from typing import Dict, Iterator, List -from .base import LLM, ChatHistoryType - -_MISSING_LIBRARIES = [] - -try: - import ollama -except ImportError: - ollama = None - _MISSING_LIBRARIES.append("ollama") - - -class OllamaLLM(LLM): - def __init__(self, name: str, gguf_file: str, sampling_parameters: Dict): - """ - Loads and serves the model from the GGUF file into Ollama with the given name and sampling parameters. - """ - super().__init__(name, sampling_parameters) - self.gguf_file = gguf_file - self.sampling_parameters = sampling_parameters - - if not self._is_ollama_running(): - self._start_ollama() - - if not self._is_model_loaded(): - self._load_model() - - def _is_ollama_running(self) -> bool: - try: - ollama.list() - return True - except: - return False - - def _start_ollama(self) -> None: - # run the bash command "ollama list" which causes Ollama to start if it is not already running - try: - os.system("/bin/bash -c 'ollama list'") - except: - raise Exception("Ollama failed to start.") - - def _is_model_loaded(self) -> bool: - for model in ollama.list()["models"]: - # model name is everything before the colon - name = model["name"].split(":")[0] - if name == self.name: - return True - return False - - def _make_modelfile_parameters(self) -> str: - if self.sampling_parameters is None or self.sampling_parameters["do_sample"] == False: - return "" - return f""" - PARAMETER temperature {self.sampling_parameters["temperature"]} - PARAMETER top_k {self.sampling_parameters["top_k"]} - PARAMETER top_p {self.sampling_parameters["top_p"]} - PARAMETER num_predict {self.sampling_parameters["max_new_tokens"]} - """ - - def _load_model(self) -> None: - modelfile = f""" - FROM {self.gguf_file} - {self._make_modelfile_parameters()} - """ - try: - ollama.create(model={self.name}, modelfile=modelfile, stream=True) - print("Loaded a new mode into Ollama.") - except: - raise Exception(f"Failed to load model {self.name} with GGUF file {self.gguf_file}.") - - def _get_message(self, response) -> str: - return response["message"]["content"] - - def _check_installation(self) -> None: - if ollama is None: - raise ImportError( - "The 'ollama' library is not installed. Please install it with 'pip install ollama'." 
- ) - - def chat(self, messages: ChatHistoryType | List[ChatHistoryType]) -> List[str]: - response = ollama.chat(model=self.name, messages=messages, stream=False) - return [self._get_message(response)] - - def chat_stream(self, messages: ChatHistoryType) -> Iterator[str]: - stream = ollama.chat( - model=self.name, - messages=messages, - stream=True, - ) - # return a new stream that only contains the message content - for chunk in stream: - yield self._get_message(chunk) diff --git a/tests/test_ollama_llm.py b/tests/test_ollama_llm.py deleted file mode 100644 index 3a5f406..0000000 --- a/tests/test_ollama_llm.py +++ /dev/null @@ -1,72 +0,0 @@ -from typing import Dict, Type -from unittest.mock import patch, MagicMock - -from panza.llm.ollama import OllamaLLM -from panza.llm import MessageType -import pytest - -MODEL: str = "test_model" -GGUF_FILE: str = "test.gguf" -SAMPLING_PARAMS: Dict = {"param1": "val1"} -REQUEST: MessageType = {"content": "write an email"} -RESPONSE: str = "here is an email" -RESPONSE_OBJ = {"message": {"content": RESPONSE}} - - -@patch("os.system") -@patch("ollama.list") -def test_ollama_llm_init_launches_ollama(ollama_list: MagicMock, os_system: MagicMock): - # When Ollama isn't running, the __init__() should start it by calling os.system(). To simulate Ollama not running, we'll mock the ollama.list() method to raise an exception. - ollama_list.side_effect = Exception("Ollama not running") - try: - OllamaLLM("test", "test.gguf", {}) - except: - pass - os_system.assert_called_once_with("/bin/bash -c 'ollama list'") - - -@patch("ollama.create") -@patch("ollama.list") -def test_ollama_llm_init_creates_model(ollama_list: MagicMock, ollama_create: MagicMock): - # When the given module isn't loaded into Ollama yet, the __init__() should load it by calling ollama.create(). To simulate the module not being loaded, we'll mock the ollama.list() method to return an empty list. 
- ollama_list.return_value = {"models": []} - OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) - ollama_create.assert_called_once() - - -# Mock all external calls to prevent side effects -@patch("os.system") -@patch("ollama.list") -@patch("ollama.create") -def test_ollama_llm_init(*args): - # Make sure __init__() sets all local variables correctly - ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) - assert ollama_llm.gguf_file == GGUF_FILE - assert ollama_llm.sampling_params == SAMPLING_PARAMS - assert ollama_llm.name == MODEL - - -# Mock all external calls to prevent side effects -@patch("os.system") -@patch("ollama.list") -@patch("ollama.create") -@patch("ollama.chat") -def test_ollama_llm_chat(ollama_chat: MagicMock, *args): - ollama_chat.return_value = RESPONSE_OBJ - ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) - assert ollama_llm.chat(REQUEST) == [RESPONSE] - ollama_chat.assert_called_once() - - -# Mock all external calls to prevent side effects -@patch("os.system") -@patch("ollama.list") -@patch("ollama.create") -@patch("ollama.chat") -def test_ollama_llm_chat_stream(ollama_chat: MagicMock, *args): - expected_iterator = iter([RESPONSE_OBJ]) - ollama_chat.return_value = expected_iterator - ollama_llm = OllamaLLM(MODEL, GGUF_FILE, SAMPLING_PARAMS) - # make sure that ollama_llm.chat() returns a generator that yields the expected response - assert list(ollama_llm.chat_stream(REQUEST)) == [RESPONSE] - ollama_chat.assert_called_once_with(model=MODEL, messages=REQUEST, stream=True) diff --git a/tests/test_prompting.py b/tests/test_prompting.py deleted file mode 100644 index 3e495a6..0000000 --- a/tests/test_prompting.py +++ /dev/null @@ -1,110 +0,0 @@ -from datetime import datetime -from pathlib import Path -from unittest.mock import MagicMock - -import pytest - -from panza.entities import Email, EmailInstruction -from panza.prompting import EmailPromptBuilder -from panza.retriever import FaissRetriever - - -def test_email_prompt_builder( - system_preamble_path: Path, - user_preamble_path: Path, - rag_preamble_path: Path, - thread_preamble_path: Path, -): - # TODO: Split into multiple tests - - # Patch the retrieve method to return a list of emails - mock_retriever = MagicMock(spec=FaissRetriever) - emails = [ - Email(email=f"email{i}", subject=f"subject{i}", thread=[f"thread{i}"], date=datetime.now()) - for i in range(3) - ] - mock_retriever.retrieve.return_value = emails - - instruction = EmailInstruction( - instruction="Write an email.", thread=["email1", "email2", "email3"] - ) - - ( - system_preamble, - user_preamble, - rag_preamble, - thread_preamble, - ) = EmailPromptBuilder.load_all_preambles( - system_preamble_path=system_preamble_path, - user_preamble_path=user_preamble_path, - rag_preamble_path=rag_preamble_path, - thread_preamble_path=thread_preamble_path, - ) - - prompt_builder = EmailPromptBuilder( - retriever=mock_retriever, - system_preamble=system_preamble, - user_preamble=user_preamble, - rag_preamble=rag_preamble, - thread_preamble=thread_preamble, - number_rag_emails=3, - rag_relevance_threshold=0.0, - number_thread_emails=1, - ) - - rag_prompt = prompt_builder._create_rag_preamble_from_emails(emails=emails) - - assert rag_prompt == ( - "RAG PREAMBLE:\n\n" - + "E-MAIL CONTENT:\nemail0\n\n---\n\n" - + "E-MAIL CONTENT:\nemail1\n\n---\n\n" - + "E-MAIL CONTENT:\nemail2\n\n---\n\n" - ) - - thread_prompt = prompt_builder._create_threading_preamble(thread=instruction.thread) - - assert thread_prompt == ( - "THREAD PREAMBLE:\n\n" + "email1\n\n---\n\n" + 
"email2\n\n---\n\n" + "email3\n\n---\n\n" - ) - - # Test full prompt - prompt = prompt_builder.build_prompt(instruction=instruction, use_rag=True, use_thread=True) - assert prompt == ( - "\n\n" - + "\n\n" - + "RAG PREAMBLE:\n\n" - + "E-MAIL CONTENT:\nemail0\n\n---\n\n" - + "E-MAIL CONTENT:\nemail1\n\n---\n\n" - + "E-MAIL CONTENT:\nemail2\n\n---\n\n" - + "THREAD PREAMBLE:\n\n" - + "email1\n\n---\n\n" - + "Instruction: Write an email." - ) - - # Test prompt without RAG - prompt = prompt_builder.build_prompt(instruction=instruction, use_rag=False, use_thread=True) - assert prompt == ( - "\n\n" - + "\n\n" - + "THREAD PREAMBLE:\n\n" - + "email1\n\n---\n\n" - + "Instruction: Write an email." - ) - - # Test prompt without thread - prompt = prompt_builder.build_prompt(instruction=instruction, use_rag=True, use_thread=False) - assert prompt == ( - "\n\n" - + "\n\n" - + "RAG PREAMBLE:\n\n" - + "E-MAIL CONTENT:\nemail0\n\n---\n\n" - + "E-MAIL CONTENT:\nemail1\n\n---\n\n" - + "E-MAIL CONTENT:\nemail2\n\n---\n\n" - + "Instruction: Write an email." - ) - - # Test prompt without RAG and thread - prompt = prompt_builder.build_prompt(instruction=instruction, use_rag=False, use_thread=False) - assert prompt == ( - "\n\n" + "\n\n" + "Instruction: Write an email." - ) From 677dd8ae37a694959155ae4df955ec8d531f6421 Mon Sep 17 00:00:00 2001 From: Jen Iofinova Date: Mon, 18 Nov 2024 11:04:35 +0100 Subject: [PATCH 110/112] Update README.md remove confusing period. --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a7b983f..af1f125 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,7 @@ Examples: ``` bash CUDA_VISIBLE_DEVICES=X ./train_rosa.sh # Will use the default parameters. -CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep. +CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-6 finetuning.max_duration=7ep ```
FAQs. @@ -191,8 +191,8 @@ CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e- ### Step 5: Launch Panza! -- To run Panza after a full training run, try something like `CUDA_VISIBLE_DEVICES=0 ./runner.sh user=USERNAME interfaces=cli writer/llm=transformers`. -- To run Panza after a RoSA or LoRA training run, replace `writer/llm=transformers` with `writer/llm=peft` TODO Armand: can we fix this? +- To run Panza after a full training run, run a command like `CUDA_VISIBLE_DEVICES=0 ./runner.sh user=USERNAME interfaces=cli writer/llm=transformers model=latest`. +- To run Panza after a RoSA or LoRA training run, replace `writer/llm=transformers` with `writer/llm=peft` :email: **Have fun with your new email writing assistant!** :email: From e1e8e6ddaea6a1126e5c7fc60af992e8d259c35a Mon Sep 17 00:00:00 2001 From: Jen Iofinova Date: Mon, 18 Nov 2024 11:23:24 +0100 Subject: [PATCH 111/112] Update README.md Add instructions for quantized training --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index af1f125..861f963 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,13 @@ CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e-
+On a smaller GPU, it may be necessary to further train in lower precision (QRoSA). This can be run as follows: + +``` bash +./train_rosa.sh finetuning.precision=fp32 finetuning.model.weight_bias_dtype=4bit finetuning.model.compute_dtype=fp32 +``` + + ### Step 5: Launch Panza! From 525d0e35f7744754f0fba6aa5c594395c272ea58 Mon Sep 17 00:00:00 2001 From: Eugenia Iofinova Date: Mon, 18 Nov 2024 14:06:35 +0100 Subject: [PATCH 112/112] correct README for quantized training --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 861f963..9f05343 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,7 @@ CUDA_VISIBLE_DEVICES=X ./train_rosa.sh finetuning.lr=1e-6 finetuning.rosa_lr=1e- On a smaller GPU, it may be necessary to further train in lower precision (QRoSA). This can be run as follows: ``` bash -./train_rosa.sh finetuning.precision=fp32 finetuning.model.weight_bias_dtype=4bit finetuning.model.compute_dtype=fp32 +./train_rosa.sh finetuning.precision=amp_bf16 finetuning.model.weight_bias_dtype=4bit ```
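
Putting patches 110-112 together, a minimal sketch of the resulting workflow is shown below. It assumes the Hydra-style overrides in the README compose freely within one invocation, and treats `USERNAME`, the GPU index, and the learning-rate/duration values as placeholders rather than values mandated by the patches:

``` bash
# Quantized RoSA (QRoSA) fine-tuning for a smaller GPU, optionally combined
# with the learning-rate and duration overrides from the earlier README example.
# Sketch only: assumes these overrides can be passed together in a single run.
CUDA_VISIBLE_DEVICES=0 ./train_rosa.sh \
    finetuning.precision=amp_bf16 \
    finetuning.model.weight_bias_dtype=4bit \
    finetuning.lr=1e-6 \
    finetuning.rosa_lr=1e-6 \
    finetuning.max_duration=7ep

# Launch Panza on the resulting RoSA/LoRA checkpoint (Step 5 of the README,
# with writer/llm=transformers swapped for writer/llm=peft).
CUDA_VISIBLE_DEVICES=0 ./runner.sh user=USERNAME interfaces=cli writer/llm=peft model=latest
```

The launch command simply applies the README's own substitution (`writer/llm=peft` in place of `writer/llm=transformers`) to the full-training example; no flags beyond those already documented are introduced.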