Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/stack showcase #81

Merged
merged 12 commits into from
Dec 7, 2023
1 change: 1 addition & 0 deletions .typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ extend-exclude = ["*.csv", "sign-language-detection-yolov5/*", "orbit-user-analy

[default.extend-identifiers]
# HashiCorp = "HashiCorp"
connexion = "connexion"


[default.extend-words]
Expand Down
2 changes: 1 addition & 1 deletion langchain-llamaindex-slackbot/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ dmypy.json
.pyre/

# Zenml
.zen/
src/.zen/

# MLflow
mlruns/
Expand Down
14 changes: 10 additions & 4 deletions langchain-llamaindex-slackbot/src/local_testing_slackbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,15 @@
get_vector_store,
)
from zenml.logger import get_logger
from zenml.client import Client

SLACK_BOT_TOKEN = (Client().get_secret("langchain_project_secret")
.secret_values["slack_bot_token"])
SLACK_APP_TOKEN = (Client().get_secret("langchain_project_secret")
.secret_values["slack_app_token"])
OPENAI_API_KEY = (Client().get_secret("langchain_project_secret")
.secret_values["openai_api_key"])

SLACK_BOT_TOKEN = os.getenv("SLACK_BOT_TOKEN")
SLACK_APP_TOKEN = os.getenv("SLACK_APP_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PIPELINE_NAME = os.getenv("PIPELINE_NAME", "zenml_docs_index_generation")

logger = get_logger(__name__)
Expand Down Expand Up @@ -77,7 +82,7 @@ def reply_in_thread(body: dict, say, context):
thread_ts = event.get("thread_ts", None) or event["ts"]

if context["bot_user_id"] in event["text"]:
logger.debug(f"Received message: {event['text']}")
logger.info(f"Received message: {event['text']}")
if event.get("thread_ts", None):
full_thread = [
f"{msg['text']}"
Expand Down Expand Up @@ -107,6 +112,7 @@ def reply_in_thread(body: dict, say, context):
question=event["text"],
verbose=True,
)
logger.info(output)
say(text=output, thread_ts=thread_ts)


Expand Down
24 changes: 21 additions & 3 deletions langchain-llamaindex-slackbot/src/pipelines/index_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.

import os

from steps.index_generator import index_generator
from steps.url_scraper import url_scraper
from steps.web_url_loader import web_url_loader
from zenml import pipeline
from zenml.config import DockerSettings
from zenml.config.docker_settings import SourceFileMode

pipeline_name = "zenml_docs_index_generation"
docker_settings = DockerSettings(
requirements=[
"langchain==0.0.263",
"openai==0.27.2",
"slack-bolt==1.16.2",
"slack-sdk==3.20.0",
"fastapi",
"flask",
"uvicorn",
"gcsfs==2023.5.0",
"faiss-cpu==1.7.3",
"unstructured==0.5.7",
"tiktoken",
"bs4"
],
source_files=SourceFileMode.DOWNLOAD
)


@pipeline(name=pipeline_name)
@pipeline(name=pipeline_name, settings={"docker": docker_settings})
def docs_to_index_pipeline(
docs_url: str = "",
repo_url: str = "",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ langchain==0.0.263
openai==0.27.2
slack-bolt==1.16.2
slack-sdk==3.20.0
zenml[connectors-gcp]==0.45.3
zenml[connectors-gcp]==0.45.5
fastapi
flask
uvicorn
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ langchain>=0.0.125,<=0.0.263
openai>=0.27.2,<=0.27.8
slack-bolt==1.16.2
slack-sdk==3.20.0
zenml==0.44.1
zenml==0.45.6
fastapi
flask
uvicorn
Expand All @@ -11,3 +11,4 @@ faiss-cpu>=1.7.3,<=1.7.4
unstructured>=0.5.7,<=0.7.8
lanarky==0.7.12
tiktoken
bs4
3 changes: 3 additions & 0 deletions langchain-llamaindex-slackbot/src/steps/index_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
import os

from typing import List

Expand All @@ -21,10 +22,12 @@
)
from langchain.vectorstores import FAISS, VectorStore
from zenml import step
from zenml.client import Client


@step(enable_cache=False)
def index_generator(documents: List[Document]) -> VectorStore:
os.environ["OPENAI_API_KEY"] = Client().get_secret("langchain_project_secret").secret_values["openai_api_key"]
embeddings = OpenAIEmbeddings()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
Expand Down
2 changes: 1 addition & 1 deletion langchain-llamaindex-slackbot/src/steps/url_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from steps.url_scraping_utils import get_all_pages
from zenml import step
from zenml.client import Client


@step(enable_cache=True)
Expand All @@ -36,5 +37,4 @@ def url_scraper(
Returns:
List of URLs to scrape.
"""
# examples_readme_urls = get_nested_readme_urls(repo_url)
return get_all_pages(docs_url)
2 changes: 2 additions & 0 deletions stack-showcase/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.venv*
.requirements*
53 changes: 53 additions & 0 deletions stack-showcase/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# 📜 ZenML Stack Showcase

This project aims to demonstrate the power of stacks. The code in this
project assumes that you have quite a few stacks registered already.

## default
* `default` Orchestrator
* `default` Artifact Store

```commandline
zenml stack set default
python run.py --training-pipeline
```

## local-sagemaker-step-operator-stack
* `default` Orchestrator
* `s3` Artifact Store
* `local` Image Builder
* `aws` Container Registry
* `Sagemaker` Step Operator

```commandline
zenml stack set local-sagemaker-step-operator-stack
zenml integration install aws -y
python run.py --training-pipeline
```

## sagemaker-airflow-stack
* `Airflow` Orchestrator
* `s3` Artifact Store
* `local` Image Builder
* `aws` Container Registry
* `Sagemaker` Step Operator

```commandline
zenml stack set sagemaker-airflow-stack
zenml integration install airflow -y
pip install apache-airflow-providers-docker apache-airflow~=2.5.0
zenml stack up
python run.py --training-pipeline
```

## sagemaker-stack
* `Sagemaker` Orchestrator
* `s3` Artifact Store
* `local` Image Builder
* `aws` Container Registry
* `Sagemaker` Step Operator

```commandline
zenml stack set sagemaker-stack
python run.py --training-pipeline
```
Binary file added stack-showcase/_assets/airflow_stack.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added stack-showcase/_assets/default_stack.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added stack-showcase/_assets/sagemaker_stack.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
12 changes: 12 additions & 0 deletions stack-showcase/configs/feature_engineering.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# environment configuration
settings:
docker:
required_integrations:
- sklearn

# configuration of the Model Control Plane
model_version:
name: breast_cancer_classifier
license: Apache 2.0
description: Classification of Breast Cancer Dataset.
tags: ["classification", "sklearn"]
13 changes: 13 additions & 0 deletions stack-showcase/configs/inference.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# environment configuration
settings:
docker:
required_integrations:
- sklearn

# configuration of the Model Control Plane
model_version:
name: breast_cancer_classifier
version: production
license: Apache 2.0
description: Classification of Breast Cancer Dataset.
tags: ["classification", "sklearn"]
12 changes: 12 additions & 0 deletions stack-showcase/configs/training.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# environment configuration
settings:
docker:
required_integrations:
- sklearn

# configuration of the Model Control Plane
model_version:
name: breast_cancer_classifier
license: Apache 2.0
description: Classification of Breast Cancer Dataset.
tags: ["classification", "sklearn"]
5 changes: 5 additions & 0 deletions stack-showcase/pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# {% include 'template/license_header' %}

from .feature_engineering import feature_engineering
from .inference import inference
from .training import training
54 changes: 54 additions & 0 deletions stack-showcase/pipelines/feature_engineering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# {% include 'template/license_header' %}

import random
from typing import List, Optional

from steps import (
data_loader,
data_preprocessor,
data_splitter,
)
from zenml import pipeline
from zenml.logger import get_logger

logger = get_logger(__name__)


@pipeline
def feature_engineering(
    test_size: float = 0.2,
    drop_na: Optional[bool] = None,
    normalize: Optional[bool] = None,
    drop_columns: Optional[List[str]] = None,
    target: Optional[str] = "target",
):
    """Load, split, and preprocess the dataset for model training.

    Chains three steps: load raw data, split it into train/test sets,
    then preprocess both splits with a shared preprocessing pipeline.

    Args:
        test_size: Fraction of the data held out for testing (0.0..1.0).
        drop_na: If `True`, rows containing NA values are dropped.
        normalize: If `True`, features are normalized with MinMaxScaler.
        drop_columns: Columns to remove from the dataset, if any.
        target: Name of the target column in the dataset.

    Returns:
        The preprocessed train and test datasets.
    """
    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
    # A fresh random state each run means repeated runs draw different samples.
    seed = random.randint(0, 100)
    full_df = data_loader(random_state=seed, target=target)

    train_df, test_df = data_splitter(
        dataset=full_df,
        test_size=test_size,
    )

    # The third output (the fitted preprocess pipeline) is not needed here.
    train_df, test_df, _ = data_preprocessor(
        dataset_trn=train_df,
        dataset_tst=test_df,
        drop_na=drop_na,
        normalize=normalize,
        drop_columns=drop_columns,
        target=target,
    )
    return train_df, test_df
52 changes: 52 additions & 0 deletions stack-showcase/pipelines/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# {% include 'template/license_header' %}

from typing import List, Optional

from steps import (
data_loader,
inference_preprocessor,
inference_predict,
)
from zenml import pipeline, ExternalArtifact
from zenml.client import Client
from zenml.logger import get_logger

logger = get_logger(__name__)


@pipeline
def inference(
    test_size: float = 0.2,
    drop_na: Optional[bool] = None,
    normalize: Optional[bool] = None,
    drop_columns: Optional[List[str]] = None,
):
    """Batch inference pipeline.

    Loads the inference slice of the data (re-using the random state recorded
    by the training run so the split is consistent), applies the preprocessing
    pipeline produced during training, and runs predictions on the result.

    Args:
        test_size: Unused; kept for signature compatibility with the
            training pipeline.
        drop_na: Unused; kept for signature compatibility.
        normalize: Unused; kept for signature compatibility.
        drop_columns: Unused; kept for signature compatibility.
    """
    ### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
    # Link all the steps together by calling them and passing the output
    # of one step as the input of the next step.
    client = Client()
    # Re-use the random state logged on the training dataset artifact so the
    # inference split matches the one the model was trained against.
    random_state = client.get_artifact("dataset").run_metadata["random_state"].value
    target = "target"
    df_inference = data_loader(random_state=random_state, is_inference=True)
    # Apply the exact preprocessing pipeline fitted during training,
    # fetched by name from the artifact store.
    df_inference = inference_preprocessor(
        dataset_inf=df_inference,
        preprocess_pipeline=ExternalArtifact(name="preprocess_pipeline"),
        target=target,
    )
    inference_predict(
        dataset_inf=df_inference,
    )
    ### END CODE HERE ###
Loading
Loading