Merge pull request #81 from zenml-io/feature/stack-showcase

Feature/stack showcase
zenml-io · Dec 7, 2023 · 8f74b2e · 8f74b2e
2 parents 16c49fc + e30d99a
commit 8f74b2e
Show file tree

Hide file tree

Showing 36 changed files with 2,818 additions and 11 deletions.
diff --git a/.typos.toml b/.typos.toml
@@ -3,6 +3,7 @@ extend-exclude = ["*.csv", "sign-language-detection-yolov5/*", "orbit-user-analy
 
 [default.extend-identifiers]
 #  HashiCorp = "HashiCorp"
+connexion = "connexion"
 
 
 [default.extend-words]

diff --git a/langchain-llamaindex-slackbot/.gitignore b/langchain-llamaindex-slackbot/.gitignore
@@ -129,7 +129,7 @@ dmypy.json
 .pyre/
 
 # Zenml
-.zen/
+src/.zen/
 
 # MLflow
 mlruns/

diff --git a/langchain-llamaindex-slackbot/src/local_testing_slackbot.py b/langchain-llamaindex-slackbot/src/local_testing_slackbot.py
@@ -18,10 +18,15 @@
     get_vector_store,
 )
 from zenml.logger import get_logger
+from zenml.client import Client
+
+SLACK_BOT_TOKEN = (Client().get_secret("langchain_project_secret")
+                           .secret_values["slack_bot_token"])
+SLACK_APP_TOKEN = (Client().get_secret("langchain_project_secret")
+                           .secret_values["slack_app_token"])
+OPENAI_API_KEY = (Client().get_secret("langchain_project_secret")
+                          .secret_values["openai_api_key"])
 
-SLACK_BOT_TOKEN = os.getenv("SLACK_BOT_TOKEN")
-SLACK_APP_TOKEN = os.getenv("SLACK_APP_TOKEN")
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 PIPELINE_NAME = os.getenv("PIPELINE_NAME", "zenml_docs_index_generation")
 
 logger = get_logger(__name__)
@@ -77,7 +82,7 @@ def reply_in_thread(body: dict, say, context):
     thread_ts = event.get("thread_ts", None) or event["ts"]
 
     if context["bot_user_id"] in event["text"]:
-        logger.debug(f"Received message: {event['text']}")
+        logger.info(f"Received message: {event['text']}")
         if event.get("thread_ts", None):
             full_thread = [
                 f"{msg['text']}"
@@ -107,6 +112,7 @@ def reply_in_thread(body: dict, say, context):
                 question=event["text"],
                 verbose=True,
             )
+        logger.info(output)
         say(text=output, thread_ts=thread_ts)
 
 

diff --git a/langchain-llamaindex-slackbot/src/pipelines/index_builder.py b/langchain-llamaindex-slackbot/src/pipelines/index_builder.py
@@ -11,17 +11,35 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 #  or implied. See the License for the specific language governing
 #  permissions and limitations under the License.
-
+import os
 
 from steps.index_generator import index_generator
 from steps.url_scraper import url_scraper
 from steps.web_url_loader import web_url_loader
 from zenml import pipeline
+from zenml.config import DockerSettings
+from zenml.config.docker_settings import SourceFileMode
 
 pipeline_name = "zenml_docs_index_generation"
+docker_settings = DockerSettings(
+    requirements=[
+        "langchain==0.0.263",
+        "openai==0.27.2",
+        "slack-bolt==1.16.2",
+        "slack-sdk==3.20.0",
+        "fastapi",
+        "flask",
+        "uvicorn",
+        "gcsfs==2023.5.0",
+        "faiss-cpu==1.7.3",
+        "unstructured==0.5.7",
+        "tiktoken",
+        "bs4"
+    ],
+    source_files=SourceFileMode.DOWNLOAD
+)
 
-
-@pipeline(name=pipeline_name)
+@pipeline(name=pipeline_name, settings={"docker": docker_settings})
 def docs_to_index_pipeline(
     docs_url: str = "",
     repo_url: str = "",

diff --git a/langchain-llamaindex-slackbot/src/requirements-slackbot.txt b/langchain-llamaindex-slackbot/src/requirements-slackbot.txt
@@ -2,7 +2,7 @@ langchain==0.0.263
 openai==0.27.2
 slack-bolt==1.16.2
 slack-sdk==3.20.0
-zenml[connectors-gcp]==0.45.3
+zenml[connectors-gcp]==0.45.5
 fastapi
 flask
 uvicorn

diff --git a/langchain-llamaindex-slackbot/src/requirements-zenml-io-qa.txt b/langchain-llamaindex-slackbot/src/requirements-zenml-io-qa.txt
@@ -2,7 +2,7 @@ langchain>=0.0.125,<=0.0.263
 openai>=0.27.2,<=0.27.8
 slack-bolt==1.16.2
 slack-sdk==3.20.0
-zenml==0.44.1
+zenml==0.45.6
 fastapi
 flask
 uvicorn
@@ -11,3 +11,4 @@ faiss-cpu>=1.7.3,<=1.7.4
 unstructured>=0.5.7,<=0.7.8
 lanarky==0.7.12
 tiktoken
+bs4
diff --git a/langchain-llamaindex-slackbot/src/steps/index_generator.py b/langchain-llamaindex-slackbot/src/steps/index_generator.py
@@ -11,6 +11,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 #  or implied. See the License for the specific language governing
 #  permissions and limitations under the License.
+import os
 
 from typing import List
 
@@ -21,10 +22,12 @@
 )
 from langchain.vectorstores import FAISS, VectorStore
 from zenml import step
+from zenml.client import Client
 
 
 @step(enable_cache=False)
 def index_generator(documents: List[Document]) -> VectorStore:
+    os.environ["OPENAI_API_KEY"] = Client().get_secret("langchain_project_secret").secret_values["openai_api_key"]
     embeddings = OpenAIEmbeddings()
 
     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

diff --git a/langchain-llamaindex-slackbot/src/steps/url_scraper.py b/langchain-llamaindex-slackbot/src/steps/url_scraper.py
@@ -16,6 +16,7 @@
 
 from steps.url_scraping_utils import get_all_pages
 from zenml import step
+from zenml.client import Client
 
 
 @step(enable_cache=True)
@@ -36,5 +37,4 @@ def url_scraper(
     Returns:
         List of URLs to scrape.
     """
-    # examples_readme_urls = get_nested_readme_urls(repo_url)
     return get_all_pages(docs_url)
diff --git a/stack-showcase/.dockerignore b/stack-showcase/.dockerignore
@@ -0,0 +1,2 @@
+.venv*
+.requirements*
diff --git a/stack-showcase/README.md b/stack-showcase/README.md
@@ -0,0 +1,53 @@
+# 📜 ZenML Stack Showcase
+
+This project aims to demonstrate the power of ZenML stacks. The code in this
+project assumes that you already have the stacks listed below registered.
+
+## default
+  * `default` Orchestrator
+  * `default` Artifact Store
+
+```commandline
+zenml stack set default
+python run.py --training-pipeline
+```
+
+## local-sagemaker-step-operator-stack
+  * `default` Orchestrator
+  * `s3` Artifact Store
+  * `local` Image Builder
+  * `aws` Container Registry
+  * `Sagemaker` Step Operator
+
+```commandline
+zenml stack set local-sagemaker-step-operator-stack
+zenml integration install aws -y
+python run.py --training-pipeline
+```
+
+## sagemaker-airflow-stack
+  * `Airflow` Orchestrator
+  * `s3` Artifact Store
+  * `local` Image Builder
+  * `aws` Container Registry
+  * `Sagemaker` Step Operator
+
+```commandline
+zenml stack set sagemaker-airflow-stack
+zenml integration install airflow -y
+pip install apache-airflow-providers-docker apache-airflow~=2.5.0
+zenml stack up
+python run.py --training-pipeline
+```
+
+## sagemaker-stack
+  * `Sagemaker` Orchestrator
+  * `s3` Artifact Store
+  * `local` Image Builder
+  * `aws` Container Registry
+  * `Sagemaker` Step Operator
+
+```commandline
+zenml stack set sagemaker-stack
+python run.py --training-pipeline
+```
diff --git a/stack-showcase/_assets/airflow_stack.png b/stack-showcase/_assets/airflow_stack.png
diff --git a/stack-showcase/_assets/default_stack.png b/stack-showcase/_assets/default_stack.png
diff --git a/stack-showcase/_assets/local_sagmaker_so_stack.png b/stack-showcase/_assets/local_sagmaker_so_stack.png
diff --git a/stack-showcase/_assets/sagemaker_stack.png b/stack-showcase/_assets/sagemaker_stack.png
diff --git a/stack-showcase/configs/feature_engineering.yaml b/stack-showcase/configs/feature_engineering.yaml
@@ -0,0 +1,12 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+
+# configuration of the Model Control Plane
+model_version:
+  name: breast_cancer_classifier
+  license: Apache 2.0
+  description: Classification of Breast Cancer Dataset.
+  tags: ["classification", "sklearn"]
diff --git a/stack-showcase/configs/inference.yaml b/stack-showcase/configs/inference.yaml
@@ -0,0 +1,13 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+
+# configuration of the Model Control Plane
+model_version:
+  name: breast_cancer_classifier
+  version: production
+  license: Apache 2.0
+  description: Classification of Breast Cancer Dataset.
+  tags: ["classification", "sklearn"]
diff --git a/stack-showcase/configs/training.yaml b/stack-showcase/configs/training.yaml
@@ -0,0 +1,12 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+
+# configuration of the Model Control Plane
+model_version:
+  name: breast_cancer_classifier
+  license: Apache 2.0
+  description: Classification of Breast Cancer Dataset.
+  tags: ["classification", "sklearn"]
diff --git a/stack-showcase/pipelines/__init__.py b/stack-showcase/pipelines/__init__.py
@@ -0,0 +1,5 @@
+# {% include 'template/license_header' %}
+
+from .feature_engineering import feature_engineering
+from .inference import inference
+from .training import training
diff --git a/stack-showcase/pipelines/feature_engineering.py b/stack-showcase/pipelines/feature_engineering.py
@@ -0,0 +1,54 @@
+# {% include 'template/license_header' %}
+
+import random
+from typing import List, Optional
+
+from steps import (
+    data_loader,
+    data_preprocessor,
+    data_splitter,
+)
+from zenml import pipeline
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def feature_engineering(
+    test_size: float = 0.2,
+    drop_na: Optional[bool] = None,
+    normalize: Optional[bool] = None,
+    drop_columns: Optional[List[str]] = None,
+    target: Optional[str] = "target",
+):
+    """
+    Feature engineering pipeline.
+
+    Loads the data, splits it into train and test sets and preprocesses
+    both sets. Returns the preprocessed train and test datasets.
+
+    Args:
+        test_size: Size of holdout set for training 0.0..1.0
+        drop_na: If `True` NA values will be removed from dataset
+        normalize: If `True` dataset will be normalized with MinMaxScaler
+        drop_columns: List of columns to drop from dataset
+        target: Name of target column in dataset
+    """
+    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
+    # Link the steps together by passing each step's output to the next one.
+    # A new random_state in [0, 100] is drawn every time this function runs.
+    raw_data = data_loader(random_state=random.randint(0, 100), target=target)
+    dataset_trn, dataset_tst = data_splitter(
+        dataset=raw_data,
+        test_size=test_size,
+    )
+    dataset_trn, dataset_tst, _ = data_preprocessor(
+        dataset_trn=dataset_trn,
+        dataset_tst=dataset_tst,
+        drop_na=drop_na,
+        normalize=normalize,
+        drop_columns=drop_columns,
+        target=target,
+    )
+    return dataset_trn, dataset_tst
diff --git a/stack-showcase/pipelines/inference.py b/stack-showcase/pipelines/inference.py
@@ -0,0 +1,52 @@
+# {% include 'template/license_header' %}
+
+from typing import List, Optional
+
+from steps import (
+    data_loader,
+    inference_preprocessor,
+    inference_predict,
+)
+from zenml import pipeline, ExternalArtifact
+from zenml.client import Client
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def inference(
+    test_size: float = 0.2,
+    drop_na: Optional[bool] = None,
+    normalize: Optional[bool] = None,
+    drop_columns: Optional[List[str]] = None,
+):
+    """
+    Inference pipeline.
+
+    Loads the inference data, applies the stored `preprocess_pipeline`
+    artifact to it and passes the result to `inference_predict`. The
+    random state is read from the `dataset` artifact's run metadata.
+
+    Args:
+        test_size: Not read by the body of this pipeline
+        drop_na: Not read by the body of this pipeline
+        normalize: Not read by the body of this pipeline
+        drop_columns: Not read by the body of this pipeline
+    """
+    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
+    # Link all the steps together by calling them and passing the output
+    # of one step as the input of the next step.
+    client = Client()
+    random_state = client.get_artifact("dataset").run_metadata["random_state"].value
+    target = "target"
+    df_inference = data_loader(random_state=random_state, is_inference=True)
+    df_inference = inference_preprocessor(
+        dataset_inf=df_inference,
+        preprocess_pipeline=ExternalArtifact(name="preprocess_pipeline"),
+        target=target,
+    )
+    inference_predict(
+        dataset_inf=df_inference,
+    )
+    ### END CODE HERE ###
-Original file line number
+Diff line change
@@ Expand Up / @@ -129,7 +129,7 @@ dmypy.json @@
     .pyre/
     # Zenml
-    .zen/
+    src/.zen/
     # MLflow
     mlruns/
@@ Expand Down @@