Skip to content

Commit

Permalink
Merge branch 'misc/add-deployment-llm-complete' of https://github.com/zenml-io/zenml-projects into misc/add-deployment-llm-complete
Browse files: browse the repository at this point in the history
  • Loading branch information
safoinme committed Dec 10, 2024
2 parents 1d93385 + 7bf314e commit a5c8be6
Show file tree
Hide file tree
Showing 10 changed files with 47 additions and 30 deletions.
5 changes: 3 additions & 2 deletions llm-complete-guide/steps/chunk_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)
from structures import Document
from utils.llm_utils import split_documents
from zenml import log_artifact_metadata, step
from zenml import log_metadata, step
from zenml.logger import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -137,8 +137,9 @@ def chunk_documents(
logger.info(
f"Number of documents after chunking: {num_docs_after_chunking}"
)
log_artifact_metadata(
log_metadata(
artifact_name="chunked_documents",
infer_artifact=True,
metadata={
"before_chunking_count": num_docs_before_chunking,
"after_chunking_count": num_docs_after_chunking,
Expand Down
12 changes: 7 additions & 5 deletions llm-complete-guide/steps/eval_pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import matplotlib.pyplot as plt
from datasets import Dataset
from PIL import Image
from zenml import log_artifact_metadata, step
from zenml import log_metadata, step


class PIIDetector:
Expand Down Expand Up @@ -305,8 +305,9 @@ def eval_pii(
"dates_found": train_results["statistics"]["total_findings"]["dates"],
"ips_found": train_results["statistics"]["total_findings"]["ips"],
}
log_artifact_metadata(
metadata=train_metadata, artifact_name="train_pii_results"
log_metadata(
metadata=train_metadata, artifact_name="train_pii_results",
infer_artifact=True
)

test_metadata = {
Expand All @@ -320,8 +321,9 @@ def eval_pii(
"dates_found": test_results["statistics"]["total_findings"]["dates"],
"ips_found": test_results["statistics"]["total_findings"]["ips"],
}
log_artifact_metadata(
metadata=test_metadata, artifact_name="test_pii_results"
log_metadata(
metadata=test_metadata, artifact_name="test_pii_results",
infer_artifact=True
)

pii_chart = plot_pii_results(train_results, test_results)
Expand Down
13 changes: 7 additions & 6 deletions llm-complete-guide/steps/finetune_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
)
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.util import cos_sim
from zenml import ArtifactConfig, log_model_metadata, step
from zenml import ArtifactConfig, log_metadata, step
from zenml.client import Client
from zenml.utils.cuda_utils import cleanup_gpu_memory

Expand Down Expand Up @@ -168,8 +168,8 @@ def evaluate_base_model(
for dim in EMBEDDINGS_MODEL_MATRYOSHKA_DIMS
}

log_model_metadata(
metadata={"base_model_eval": base_model_eval},
log_metadata(
metadata={"base_model_eval": base_model_eval}, infer_model=True
)

return results
Expand Down Expand Up @@ -201,8 +201,8 @@ def evaluate_finetuned_model(
for dim in EMBEDDINGS_MODEL_MATRYOSHKA_DIMS
}

log_model_metadata(
metadata={"finetuned_model_eval": finetuned_model_eval},
log_metadata(
metadata={"finetuned_model_eval": finetuned_model_eval}, infer_model=True
)

return results
Expand Down Expand Up @@ -298,7 +298,8 @@ def finetune(
token=zenml_client.get_secret(SECRET_NAME).secret_values["hf_token"],
)

log_model_metadata(
log_metadata(
infer_model=True,
metadata={
"training_params": {
"num_train_epochs": epochs,
Expand Down
14 changes: 9 additions & 5 deletions llm-complete-guide/steps/finetune_embeddings_legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from torch.nn import CosineSimilarity
from torch.utils.data import DataLoader
from utils.visualization_utils import create_comparison_chart
from zenml import log_artifact_metadata, step
from zenml import log_metadata, step
from zenml.logger import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -79,12 +79,14 @@ def load_datasets(
print("train_dataset_length_raw", len(train_dataset))
print("test_dataset_length_raw", len(test_dataset))

log_artifact_metadata(
log_metadata(
artifact_name="train_dataset",
infer_artifact=True,
metadata={"row_count": len(train_dataset)},
)
log_artifact_metadata(
log_metadata(
artifact_name="test_dataset",
infer_artifact=True,
metadata={"row_count": len(test_dataset)},
)

Expand Down Expand Up @@ -187,8 +189,9 @@ def train_model(
warmup_steps=warmup_steps,
)

log_artifact_metadata(
log_metadata(
artifact_name="trained_model",
infer_artifact=True,
metadata={
"model_path": model_path,
"num_epochs": num_epochs,
Expand Down Expand Up @@ -280,8 +283,9 @@ def evaluate_model(
finetuned_similarity=finetuned_avg_sim,
)

log_artifact_metadata(
log_metadata(
artifact_name="evaluation_results",
infer_artifact=True,
metadata={
"pretrained_average_similarity": {
"value": pretrained_avg_sim,
Expand Down
5 changes: 3 additions & 2 deletions llm-complete-guide/steps/generate_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from rich import print
from structures import Document
from utils.openai_utils import get_openai_api_key
from zenml import log_artifact_metadata, step
from zenml import log_metadata, step
from zenml.logger import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -160,8 +160,9 @@ def generate_questions(
f"Generated {len(final_df)} questions for {len(documents)} documents."
)

log_artifact_metadata(
log_metadata(
artifact_name="generated_questions",
infer_artifact=True,
metadata={
"num_documents": len(documents),
"num_questions_generated": len(final_df),
Expand Down
5 changes: 3 additions & 2 deletions llm-complete-guide/steps/markdown_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import polars as pl
from constants import FILES_TO_IGNORE
from zenml import log_artifact_metadata, step
from zenml import log_metadata, step
from zenml.logger import get_logger

logger = get_logger(__name__)
Expand Down Expand Up @@ -61,8 +61,9 @@ def load_markdown_files(
f"Subfolder '{subfolder}' not found in the cloned repository."
)

log_artifact_metadata(
log_metadata(
artifact_name="markdown_files",
infer_artifact=True,
metadata={
"num_markdown_files": len(markdown_files),
"columns": "filename, page_content",
Expand Down
14 changes: 9 additions & 5 deletions llm-complete-guide/steps/populate_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
from sentence_transformers import SentenceTransformer
from structures import Document
from utils.llm_utils import get_db_conn, get_es_client, split_documents
from zenml import ArtifactConfig, log_artifact_metadata, step, log_model_metadata
from zenml import ArtifactConfig, log_metadata, step, log_metadata
from zenml.metadata.metadata_types import Uri
from zenml.client import Client
from constants import SECRET_NAME
Expand Down Expand Up @@ -515,8 +515,9 @@ def preprocess_documents(
Exception: If an error occurs during preprocessing.
"""
try:
log_artifact_metadata(
log_metadata(
artifact_name="split_chunks",
infer_artifact=True,
metadata={
"chunk_size": CHUNK_SIZE,
"chunk_overlap": CHUNK_OVERLAP,
Expand All @@ -536,8 +537,9 @@ def preprocess_documents(
histogram_chart: Image.Image = create_histogram(stats)
bar_chart: Image.Image = create_bar_chart(stats)

log_artifact_metadata(
log_metadata(
artifact_name="split_chunks",
infer_artifact=True,
metadata=stats,
)

Expand Down Expand Up @@ -568,8 +570,9 @@ def generate_embeddings(
try:
model = SentenceTransformer(EMBEDDINGS_MODEL)

log_artifact_metadata(
log_metadata(
artifact_name="documents_with_embeddings",
infer_artifact=True,
metadata={
"embedding_type": EMBEDDINGS_MODEL,
"embedding_dimensionality": EMBEDDING_DIMENSIONALITY,
Expand Down Expand Up @@ -828,7 +831,8 @@ def _log_metadata(index_type: IndexType) -> None:
"dbname": "postgres",
}

log_model_metadata(
log_metadata(
infer_model=True,
metadata={
"embeddings": {
"model": EMBEDDINGS_MODEL,
Expand Down
5 changes: 3 additions & 2 deletions llm-complete-guide/steps/url_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import json

from typing_extensions import Annotated
from zenml import ArtifactConfig, log_artifact_metadata, step
from zenml import ArtifactConfig, log_metadata, step

from steps.url_scraping_utils import get_all_pages

Expand Down Expand Up @@ -58,8 +58,9 @@ def url_scraper(
# website_urls = get_all_pages(website_url)
# all_urls = docs_urls + website_urls + examples_readme_urls
all_urls = docs_urls
log_artifact_metadata(
log_metadata(
artifact_name="urls",
infer_artifact=True,
metadata={
"count": len(all_urls),
},
Expand Down
2 changes: 2 additions & 0 deletions llm-complete-guide/steps/visualize_chat.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from typing import Any, Dict

from typing_extensions import Annotated
from zenml import get_step_context, log_metadata, step
from zenml.metadata.metadata_types import Uri
from zenml.types import HTMLString
from zenml.utils.dashboard_utils import get_model_version_url


@step(enable_cache=False)
def create_chat_interface(
deployment_info: Dict[str, Any],
Expand Down
2 changes: 1 addition & 1 deletion llm-complete-guide/utils/llm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ def find_vectorstore_name() -> str:
client = Client()
model = client.get_model_version(ZENML_CHATBOT_MODEL, model_version_name_or_number_or_id="v0.68.1-dev")

return model.run_metadata["vector_store"].value["name"]
return model.run_metadata["vector_store"]["name"]


def rerank_documents(
Expand Down

0 comments on commit a5c8be6

Please sign in to comment.