From 7bf314e3db8063a409b5fa13ab163ed746bc7cff Mon Sep 17 00:00:00 2001 From: Alexej Penner Date: Tue, 10 Dec 2024 15:40:30 +0100 Subject: [PATCH] Updated deprecated code --- llm-complete-guide/steps/chunk_documents.py | 5 +++-- llm-complete-guide/steps/eval_pii.py | 12 +++++++----- llm-complete-guide/steps/finetune_embeddings.py | 13 +++++++------ .../steps/finetune_embeddings_legacy.py | 14 +++++++++----- llm-complete-guide/steps/generate_questions.py | 5 +++-- llm-complete-guide/steps/markdown_loader.py | 5 +++-- llm-complete-guide/steps/populate_index.py | 14 +++++++++----- llm-complete-guide/steps/url_scraper.py | 5 +++-- llm-complete-guide/steps/visualize_chat.py | 2 +- llm-complete-guide/utils/llm_utils.py | 2 +- 10 files changed, 46 insertions(+), 31 deletions(-) diff --git a/llm-complete-guide/steps/chunk_documents.py b/llm-complete-guide/steps/chunk_documents.py index 43983604..8a48c26b 100644 --- a/llm-complete-guide/steps/chunk_documents.py +++ b/llm-complete-guide/steps/chunk_documents.py @@ -21,7 +21,7 @@ ) from structures import Document from utils.llm_utils import split_documents -from zenml import log_artifact_metadata, step +from zenml import log_metadata, step from zenml.logger import get_logger logger = get_logger(__name__) @@ -137,8 +137,9 @@ def chunk_documents( logger.info( f"Number of documents after chunking: {num_docs_after_chunking}" ) - log_artifact_metadata( + log_metadata( artifact_name="chunked_documents", + infer_artifact=True, metadata={ "before_chunking_count": num_docs_before_chunking, "after_chunking_count": num_docs_after_chunking, diff --git a/llm-complete-guide/steps/eval_pii.py b/llm-complete-guide/steps/eval_pii.py index b81237f3..460e0e35 100644 --- a/llm-complete-guide/steps/eval_pii.py +++ b/llm-complete-guide/steps/eval_pii.py @@ -6,7 +6,7 @@ import matplotlib.pyplot as plt from datasets import Dataset from PIL import Image -from zenml import log_artifact_metadata, step +from zenml import log_metadata, step class 
PIIDetector: @@ -305,8 +305,9 @@ def eval_pii( "dates_found": train_results["statistics"]["total_findings"]["dates"], "ips_found": train_results["statistics"]["total_findings"]["ips"], } - log_artifact_metadata( - metadata=train_metadata, artifact_name="train_pii_results" + log_metadata( + metadata=train_metadata, artifact_name="train_pii_results", + infer_artifact=True ) test_metadata = { @@ -320,8 +321,9 @@ def eval_pii( "dates_found": test_results["statistics"]["total_findings"]["dates"], "ips_found": test_results["statistics"]["total_findings"]["ips"], } - log_artifact_metadata( - metadata=test_metadata, artifact_name="test_pii_results" + log_metadata( + metadata=test_metadata, artifact_name="test_pii_results", + infer_artifact=True ) pii_chart = plot_pii_results(train_results, test_results) diff --git a/llm-complete-guide/steps/finetune_embeddings.py b/llm-complete-guide/steps/finetune_embeddings.py index 3523e622..8ef535b4 100644 --- a/llm-complete-guide/steps/finetune_embeddings.py +++ b/llm-complete-guide/steps/finetune_embeddings.py @@ -47,7 +47,7 @@ ) from sentence_transformers.training_args import BatchSamplers from sentence_transformers.util import cos_sim -from zenml import ArtifactConfig, log_model_metadata, step +from zenml import ArtifactConfig, log_metadata, step from zenml.client import Client from zenml.utils.cuda_utils import cleanup_gpu_memory @@ -168,8 +168,8 @@ def evaluate_base_model( for dim in EMBEDDINGS_MODEL_MATRYOSHKA_DIMS } - log_model_metadata( - metadata={"base_model_eval": base_model_eval}, + log_metadata( + metadata={"base_model_eval": base_model_eval}, infer_model=True ) return results @@ -201,8 +201,8 @@ def evaluate_finetuned_model( for dim in EMBEDDINGS_MODEL_MATRYOSHKA_DIMS } - log_model_metadata( - metadata={"finetuned_model_eval": finetuned_model_eval}, + log_metadata( + metadata={"finetuned_model_eval": finetuned_model_eval}, infer_model=True ) return results @@ -298,7 +298,8 @@ def finetune( 
token=zenml_client.get_secret(SECRET_NAME).secret_values["hf_token"], ) - log_model_metadata( + log_metadata( + infer_model=True, metadata={ "training_params": { "num_train_epochs": epochs, diff --git a/llm-complete-guide/steps/finetune_embeddings_legacy.py b/llm-complete-guide/steps/finetune_embeddings_legacy.py index 7136e784..abda6c24 100644 --- a/llm-complete-guide/steps/finetune_embeddings_legacy.py +++ b/llm-complete-guide/steps/finetune_embeddings_legacy.py @@ -25,7 +25,7 @@ from torch.nn import CosineSimilarity from torch.utils.data import DataLoader from utils.visualization_utils import create_comparison_chart -from zenml import log_artifact_metadata, step +from zenml import log_metadata, step from zenml.logger import get_logger logger = get_logger(__name__) @@ -79,12 +79,14 @@ def load_datasets( print("train_dataset_length_raw", len(train_dataset)) print("test_dataset_length_raw", len(test_dataset)) - log_artifact_metadata( + log_metadata( artifact_name="train_dataset", + infer_artifact=True, metadata={"row_count": len(train_dataset)}, ) - log_artifact_metadata( + log_metadata( artifact_name="test_dataset", + infer_artifact=True, metadata={"row_count": len(test_dataset)}, ) @@ -187,8 +189,9 @@ def train_model( warmup_steps=warmup_steps, ) - log_artifact_metadata( + log_metadata( artifact_name="trained_model", + infer_artifact=True, metadata={ "model_path": model_path, "num_epochs": num_epochs, @@ -280,8 +283,9 @@ def evaluate_model( finetuned_similarity=finetuned_avg_sim, ) - log_artifact_metadata( + log_metadata( artifact_name="evaluation_results", + infer_artifact=True, metadata={ "pretrained_average_similarity": { "value": pretrained_avg_sim, diff --git a/llm-complete-guide/steps/generate_questions.py b/llm-complete-guide/steps/generate_questions.py index f6acdb0a..63df84ff 100644 --- a/llm-complete-guide/steps/generate_questions.py +++ b/llm-complete-guide/steps/generate_questions.py @@ -21,7 +21,7 @@ from rich import print from structures import 
Document from utils.openai_utils import get_openai_api_key -from zenml import log_artifact_metadata, step +from zenml import log_metadata, step from zenml.logger import get_logger logger = get_logger(__name__) @@ -160,8 +160,9 @@ def generate_questions( f"Generated {len(final_df)} questions for {len(documents)} documents." ) - log_artifact_metadata( + log_metadata( artifact_name="generated_questions", + infer_artifact=True, metadata={ "num_documents": len(documents), "num_questions_generated": len(final_df), diff --git a/llm-complete-guide/steps/markdown_loader.py b/llm-complete-guide/steps/markdown_loader.py index 6b2208e0..1838344e 100644 --- a/llm-complete-guide/steps/markdown_loader.py +++ b/llm-complete-guide/steps/markdown_loader.py @@ -18,7 +18,7 @@ import polars as pl from constants import FILES_TO_IGNORE -from zenml import log_artifact_metadata, step +from zenml import log_metadata, step from zenml.logger import get_logger logger = get_logger(__name__) @@ -61,8 +61,9 @@ def load_markdown_files( f"Subfolder '{subfolder}' not found in the cloned repository." 
) - log_artifact_metadata( + log_metadata( artifact_name="markdown_files", + infer_artifact=True, metadata={ "num_markdown_files": len(markdown_files), "columns": "filename, page_content", diff --git a/llm-complete-guide/steps/populate_index.py b/llm-complete-guide/steps/populate_index.py index d9a9bd95..c477f505 100644 --- a/llm-complete-guide/steps/populate_index.py +++ b/llm-complete-guide/steps/populate_index.py @@ -39,7 +39,7 @@ from sentence_transformers import SentenceTransformer from structures import Document from utils.llm_utils import get_db_conn, get_es_client, split_documents -from zenml import ArtifactConfig, log_artifact_metadata, step, log_model_metadata +from zenml import ArtifactConfig, log_metadata, step from zenml.metadata.metadata_types import Uri from zenml.client import Client from constants import SECRET_NAME @@ -515,8 +515,9 @@ def preprocess_documents( Exception: If an error occurs during preprocessing. """ try: - log_artifact_metadata( + log_metadata( artifact_name="split_chunks", + infer_artifact=True, metadata={ "chunk_size": CHUNK_SIZE, "chunk_overlap": CHUNK_OVERLAP, @@ -536,8 +537,9 @@ def preprocess_documents( histogram_chart: Image.Image = create_histogram(stats) bar_chart: Image.Image = create_bar_chart(stats) - log_artifact_metadata( + log_metadata( artifact_name="split_chunks", + infer_artifact=True, metadata=stats, ) @@ -568,8 +570,9 @@ def generate_embeddings( try: model = SentenceTransformer(EMBEDDINGS_MODEL) - log_artifact_metadata( + log_metadata( artifact_name="documents_with_embeddings", + infer_artifact=True, metadata={ "embedding_type": EMBEDDINGS_MODEL, "embedding_dimensionality": EMBEDDING_DIMENSIONALITY, @@ -828,7 +831,8 @@ def _log_metadata(index_type: IndexType) -> None: "dbname": "postgres", } - log_model_metadata( + log_metadata( + infer_model=True, metadata={ "embeddings": { "model": EMBEDDINGS_MODEL, diff --git a/llm-complete-guide/steps/url_scraper.py b/llm-complete-guide/steps/url_scraper.py 
index 9c54563b..68421d0c 100644 --- a/llm-complete-guide/steps/url_scraper.py +++ b/llm-complete-guide/steps/url_scraper.py @@ -16,7 +16,7 @@ import json from typing_extensions import Annotated -from zenml import ArtifactConfig, log_artifact_metadata, step +from zenml import ArtifactConfig, log_metadata, step from steps.url_scraping_utils import get_all_pages @@ -58,8 +58,9 @@ def url_scraper( # website_urls = get_all_pages(website_url) # all_urls = docs_urls + website_urls + examples_readme_urls all_urls = docs_urls - log_artifact_metadata( + log_metadata( artifact_name="urls", + infer_artifact=True, metadata={ "count": len(all_urls), }, diff --git a/llm-complete-guide/steps/visualize_chat.py b/llm-complete-guide/steps/visualize_chat.py index 61b0f898..b2ec02be 100644 --- a/llm-complete-guide/steps/visualize_chat.py +++ b/llm-complete-guide/steps/visualize_chat.py @@ -1,6 +1,6 @@ from typing import Optional, Dict, Any from typing_extensions import Annotated -from zenml import log_artifact_metadata, pipeline, step +from zenml import log_metadata, pipeline, step from zenml.types import HTMLString @step(enable_cache=False) diff --git a/llm-complete-guide/utils/llm_utils.py b/llm-complete-guide/utils/llm_utils.py index 07516100..31782615 100644 --- a/llm-complete-guide/utils/llm_utils.py +++ b/llm-complete-guide/utils/llm_utils.py @@ -441,7 +441,7 @@ def find_vectorstore_name() -> str: client = Client() model = client.get_model_version(ZENML_CHATBOT_MODEL, model_version_name_or_number_or_id="v0.68.1-dev") - return model.run_metadata["vector_store"].value["name"] + return model.run_metadata["vector_store"]["name"] def rerank_documents(