Skip to content

Commit

Permalink
Merge branch 'main' into on/doc-problem-types
Browse files Browse the repository at this point in the history
  • Loading branch information
sherenem authored Sep 27, 2024
2 parents bd6ec02 + 8790178 commit f952547
Show file tree
Hide file tree
Showing 13 changed files with 691 additions and 679 deletions.
2 changes: 1 addition & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ python_version = "3.10"
[packages]
torch = {index = "pytorch", version = "==2.4.0+cu121"}
tqdm = ">=4.65.0, <5.0.0"
transformers = "==4.44.2"
transformers = "==4.45.0"
numpy = ">=1.26.0, <2.0.0"
pandas = ">=2.2.0, <3.0.0"
scikit-learn = ">=1.5.1, <2.0.0"
Expand Down
1,161 changes: 589 additions & 572 deletions Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ If you are running H2O LLM Studio with a custom environment other than Pipenv, y
H2O_WAVE_MAX_REQUEST_SIZE=25MB \
H2O_WAVE_NO_LOG=true \
H2O_WAVE_PRIVATE_DIR="/download/@output/download" \
wave run app
wave run llm_studio.app
```

If you are using the [nightly conda environment](#nightly-conda-virtual-environment), you can run ```make llmstudio-conda```.
Expand Down
6 changes: 4 additions & 2 deletions app.toml.template
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,12 @@ CPUReservation = "10"
MemoryReservation = "118Gi"
MemoryLimit = "118Gi"
GPUCount = 1
GPUType = "training-gpu"
VolumeMount = "/home/llmstudio/mount"
VolumeSize = "1Ti"
ResourceVolumeSize = "1Ti"
EnableSHM = true
EnableOIDC = true
RoutingMode = "BASE_URL"
CustomImage = "docker.io/h2oairelease/h2oai-llmstudio-app:v{{VERSION}}"

[[Env]]
Name = "H2O_LLM_STUDIO_WORKDIR"
Expand All @@ -39,3 +37,7 @@ Value = "True"
[[Env]]
Name = "H2O_WAVE_PRIVATE_DIR"
Value = "/download/@/home/llmstudio/mount/output/download"

[[Env]]
Name = "HF_HUB_ENABLE_HF_TRANSFER"
Value = "0"
4 changes: 2 additions & 2 deletions documentation/docs/get-started/set-up-llm-studio.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ H2O LLM Studio requires the following minimum requirements:
H2O_WAVE_MAX_REQUEST_SIZE=25MB \ <br></br>
H2O_WAVE_NO_LOG=True \ <br></br>
H2O_WAVE_PRIVATE_DIR="/download/@output/download" \ <br></br>
wave run app
wave run llm_studio.app
</code>
</pre>
</p>
Expand Down Expand Up @@ -255,7 +255,7 @@ If you are running H2O LLM Studio with a custom environment other than Pipenv, s
H2O_WAVE_MAX_REQUEST_SIZE=25MB \
H2O_WAVE_NO_LOG=True \
H2O_WAVE_PRIVATE_DIR="/download/@output/download" \
wave run app
wave run llm_studio.app
```

### Run using Docker from a nightly build
Expand Down
48 changes: 46 additions & 2 deletions llm_studio/app_utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from types import SimpleNamespace

import toml
from huggingface_hub.constants import _is_true

toml_root_dir = os.path.abspath(
os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..")
Expand Down Expand Up @@ -43,6 +44,47 @@ def get_size(x):
url = f"http://{host}:{port}/"


def _models_from_env(env_var: str, fallback: list[str]) -> list[str]:
    """Return the model names configured in ``env_var``, else ``fallback``.

    The variable is read as a comma-separated list. Surrounding whitespace is
    stripped from every entry and blank entries (produced by trailing or
    doubled commas, or a whitespace-only value) are dropped, so the result
    never contains an empty model name.

    Args:
        env_var: Name of the environment variable to read.
        fallback: List returned when the variable is unset, empty, or holds
            no usable entry.

    Returns:
        The parsed, non-empty list of model names, or ``fallback``.
    """
    raw_value = os.getenv(env_var)
    if not raw_value:
        return fallback

    stripped = (entry.strip() for entry in raw_value.split(","))
    model_names = [entry for entry in stripped if entry]
    # Consumers index element [0] of these lists as a default backbone, so an
    # override that yields no names must not replace the built-in defaults.
    return model_names or fallback


# Backbones offered for the causal-LM based problem types; override with a
# comma-separated list in H2O_LLM_STUDIO_DEFAULT_LM_MODELS.
default_causal_language_models = _models_from_env(
    "H2O_LLM_STUDIO_DEFAULT_LM_MODELS",
    [
        "h2oai/h2o-danube3-500m-base",
        "h2oai/h2o-danube3-500m-chat",
        "h2oai/h2o-danube3-4b-base",
        "h2oai/h2o-danube3-4b-chat",
        "h2oai/h2o-danube2-1.8b-base",
        "h2oai/h2o-danube2-1.8b-chat",
        "meta-llama/Llama-3.2-1B-Instruct",
        "meta-llama/Llama-3.2-3B-Instruct",
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "meta-llama/Meta-Llama-3.1-70B-Instruct",
        "mistralai/Mistral-7B-v0.3",
        "mistralai/Mistral-7B-Instruct-v0.2",
        "google/gemma-2-2b-it",
        "google/gemma-2-9b-it",
        "microsoft/Phi-3-mini-4k-instruct",
        "microsoft/Phi-3-medium-4k-instruct",
        "Qwen/Qwen2-7B-Instruct",
        "Qwen/Qwen2-72B-Instruct",
    ],
)

# Backbones offered for the sequence-to-sequence problem type; override with a
# comma-separated list in H2O_LLM_STUDIO_DEFAULT_S2S_MODELS.
default_sequence_to_sequence_models = _models_from_env(
    "H2O_LLM_STUDIO_DEFAULT_S2S_MODELS",
    [
        "t5-small",
        "t5-base",
        "t5-large",
        "google/flan-t5-small",
        "google/flan-t5-base",
        "google/flan-t5-large",
        "google/flan-ul2",
    ],
)

default_cfg = {
"url": url,
"name": "H2O LLM Studio",
Expand All @@ -67,6 +109,8 @@ def get_size(x):
"text_sequence_to_sequence_modeling_config",
"text_dpo_modeling_config",
],
"default_causal_language_models": default_causal_language_models,
"default_sequence_to_sequence_models": default_sequence_to_sequence_models,
"problem_categories": ["text"],
"dataset_keys": [
"train_dataframe",
Expand Down Expand Up @@ -117,8 +161,8 @@ def get_size(x):
"default_wandb_project": os.getenv("WANDB_PROJECT", ""),
"default_wandb_entity": os.getenv("WANDB_ENTITY", ""),
"default_huggingface_api_token": os.getenv("HF_TOKEN", ""),
"default_hf_hub_enable_hf_transfer": os.getenv(
"HF_HUB_ENABLE_HF_TRANSFER", True
"default_hf_hub_enable_hf_transfer": _is_true(
os.getenv("HF_HUB_ENABLE_HF_TRANSFER", "1")
),
"default_openai_azure": os.getenv("OPENAI_API_TYPE", "open_ai") == "azure",
"default_openai_api_token": os.getenv("OPENAI_API_KEY", ""),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import llm_studio.src.datasets.text_causal_classification_ds
import llm_studio.src.plots.text_causal_classification_modeling_plots
from llm_studio.app_utils.config import default_cfg
from llm_studio.python_configs.base import DefaultConfig, DefaultConfigProblemBase
from llm_studio.python_configs.text_causal_language_modeling_config import (
ConfigNLPAugmentation,
Expand Down Expand Up @@ -155,7 +156,11 @@ class ConfigNLPCausalClassificationLogging(ConfigNLPCausalLMLogging):
class ConfigProblemBase(DefaultConfigProblemBase):
output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}"
experiment_name: str = field(default_factory=generate_experiment_name)
llm_backbone: str = "h2oai/h2o-danube3-500m-chat"
llm_backbone: str = (
"h2oai/h2o-danube3-500m-chat"
if "h2oai/h2o-danube3-500m-chat" in default_cfg.default_causal_language_models
else default_cfg.default_causal_language_models[0]
)

dataset: ConfigNLPCausalClassificationDataset = field(
default_factory=ConfigNLPCausalClassificationDataset
Expand Down Expand Up @@ -188,24 +193,7 @@ def __post_init__(self):
self._visibility["output_directory"] = -1

self._possible_values["llm_backbone"] = possible_values.String(
values=(
"h2oai/h2o-danube3-500m-base",
"h2oai/h2o-danube3-500m-chat",
"h2oai/h2o-danube3-4b-base",
"h2oai/h2o-danube3-4b-chat",
"h2oai/h2o-danube2-1.8b-base",
"h2oai/h2o-danube2-1.8b-chat",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Meta-Llama-3.1-70B-Instruct",
"mistralai/Mistral-7B-v0.3",
"mistralai/Mistral-7B-Instruct-v0.2",
"google/gemma-2-2b-it",
"google/gemma-2-9b-it",
"microsoft/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-medium-4k-instruct",
"Qwen/Qwen2-7B-Instruct",
"Qwen/Qwen2-72B-Instruct",
),
values=default_cfg.default_causal_language_models,
allow_custom=True,
)

Expand Down
26 changes: 7 additions & 19 deletions llm_studio/python_configs/text_causal_language_modeling_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import torch

import llm_studio.src.datasets.text_causal_language_modeling_ds
from llm_studio.app_utils.config import default_cfg
from llm_studio.python_configs.base import DefaultConfig, DefaultConfigProblemBase
from llm_studio.src import possible_values
from llm_studio.src.augmentations.nlp_aug import BaseNLPAug
Expand Down Expand Up @@ -613,7 +614,11 @@ def __post_init__(self):
class ConfigProblemBase(DefaultConfigProblemBase):
output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}"
experiment_name: str = field(default_factory=generate_experiment_name)
llm_backbone: str = "h2oai/h2o-danube3-500m-base"
llm_backbone: str = (
"h2oai/h2o-danube3-500m-base"
if "h2oai/h2o-danube3-500m-base" in default_cfg.default_causal_language_models
else default_cfg.default_causal_language_models[0]
)

dataset: ConfigNLPCausalLMDataset = field(default_factory=ConfigNLPCausalLMDataset)
tokenizer: ConfigNLPCausalLMTokenizer = field(
Expand All @@ -640,24 +645,7 @@ def __post_init__(self):
self._visibility["output_directory"] = -1

self._possible_values["llm_backbone"] = possible_values.String(
values=(
"h2oai/h2o-danube3-500m-base",
"h2oai/h2o-danube3-500m-chat",
"h2oai/h2o-danube3-4b-base",
"h2oai/h2o-danube3-4b-chat",
"h2oai/h2o-danube2-1.8b-base",
"h2oai/h2o-danube2-1.8b-chat",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Meta-Llama-3.1-70B-Instruct",
"mistralai/Mistral-7B-v0.3",
"mistralai/Mistral-7B-Instruct-v0.2",
"google/gemma-2-2b-it",
"google/gemma-2-9b-it",
"microsoft/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-medium-4k-instruct",
"Qwen/Qwen2-7B-Instruct",
"Qwen/Qwen2-72B-Instruct",
),
values=default_cfg.default_causal_language_models,
allow_custom=True,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import llm_studio.src.datasets.text_causal_regression_ds
import llm_studio.src.plots.text_causal_classification_modeling_plots
from llm_studio.app_utils.config import default_cfg
from llm_studio.python_configs.base import DefaultConfig, DefaultConfigProblemBase
from llm_studio.python_configs.text_causal_classification_modeling_config import (
ConfigNLPCausalClassificationAugmentation as ConfigNLPCausalRegressionAugmentation,
Expand Down Expand Up @@ -109,7 +110,11 @@ def __post_init__(self):
class ConfigProblemBase(DefaultConfigProblemBase):
output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}"
experiment_name: str = field(default_factory=generate_experiment_name)
llm_backbone: str = "h2oai/h2o-danube3-500m-chat"
llm_backbone: str = (
"h2oai/h2o-danube3-500m-chat"
if "h2oai/h2o-danube3-500m-chat" in default_cfg.default_causal_language_models
else default_cfg.default_causal_language_models[0]
)

dataset: ConfigNLPCausalRegressionDataset = field(
default_factory=ConfigNLPCausalRegressionDataset
Expand Down Expand Up @@ -142,24 +147,7 @@ def __post_init__(self):
self._visibility["output_directory"] = -1

self._possible_values["llm_backbone"] = possible_values.String(
values=(
"h2oai/h2o-danube3-500m-base",
"h2oai/h2o-danube3-500m-chat",
"h2oai/h2o-danube3-4b-base",
"h2oai/h2o-danube3-4b-chat",
"h2oai/h2o-danube2-1.8b-base",
"h2oai/h2o-danube2-1.8b-chat",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Meta-Llama-3.1-70B-Instruct",
"mistralai/Mistral-7B-v0.3",
"mistralai/Mistral-7B-Instruct-v0.2",
"google/gemma-2-2b-it",
"google/gemma-2-9b-it",
"microsoft/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-medium-4k-instruct",
"Qwen/Qwen2-7B-Instruct",
"Qwen/Qwen2-72B-Instruct",
),
values=default_cfg.default_causal_language_models,
allow_custom=True,
)

Expand Down
26 changes: 7 additions & 19 deletions llm_studio/python_configs/text_dpo_modeling_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any

import llm_studio.src.datasets.text_dpo_modeling_ds
from llm_studio.app_utils.config import default_cfg
from llm_studio.python_configs.base import DefaultConfigProblemBase
from llm_studio.python_configs.text_causal_language_modeling_config import (
ConfigNLPAugmentation,
Expand Down Expand Up @@ -108,7 +109,11 @@ class ConfigProblemBase(DefaultConfigProblemBase):
output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}"
experiment_name: str = field(default_factory=generate_experiment_name)

llm_backbone: str = "h2oai/h2o-danube3-500m-chat"
llm_backbone: str = (
"h2oai/h2o-danube3-500m-chat"
if "h2oai/h2o-danube3-500m-chat" in default_cfg.default_causal_language_models
else default_cfg.default_causal_language_models[0]
)

dataset: ConfigDPODataset = field(default_factory=ConfigDPODataset)
tokenizer: ConfigNLPCausalLMTokenizer = field(
Expand All @@ -129,23 +134,6 @@ def __post_init__(self):
super().__post_init__()
self._visibility["output_directory"] = -1
self._possible_values["llm_backbone"] = possible_values.String(
values=(
"h2oai/h2o-danube3-500m-base",
"h2oai/h2o-danube3-500m-chat",
"h2oai/h2o-danube3-4b-base",
"h2oai/h2o-danube3-4b-chat",
"h2oai/h2o-danube2-1.8b-base",
"h2oai/h2o-danube2-1.8b-chat",
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"meta-llama/Meta-Llama-3.1-70B-Instruct",
"mistralai/Mistral-7B-v0.3",
"mistralai/Mistral-7B-Instruct-v0.2",
"google/gemma-2-2b-it",
"google/gemma-2-9b-it",
"microsoft/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-medium-4k-instruct",
"Qwen/Qwen2-7B-Instruct",
"Qwen/Qwen2-72B-Instruct",
),
values=default_cfg.default_causal_language_models,
allow_custom=True,
)
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from dataclasses import dataclass, field
from typing import Any, Dict, List

from llm_studio.app_utils.config import default_cfg
from llm_studio.python_configs.base import DefaultConfigProblemBase
from llm_studio.python_configs.text_causal_language_modeling_config import (
ConfigNLPAugmentation,
Expand Down Expand Up @@ -72,7 +73,11 @@ def __post_init__(self):
class ConfigProblemBase(DefaultConfigProblemBase):
output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}"
experiment_name: str = field(default_factory=generate_experiment_name)
llm_backbone: str = "t5-small"
llm_backbone: str = (
"t5-small"
if "t5-small" in default_cfg.default_sequence_to_sequence_models
else default_cfg.default_sequence_to_sequence_models[0]
)

dataset: ConfigNLPSeq2SeqDataset = field(default_factory=ConfigNLPSeq2SeqDataset)
tokenizer: ConfigNLPCausalLMTokenizer = field(
Expand All @@ -99,15 +104,7 @@ def __post_init__(self):
self._visibility["output_directory"] = -1

self._possible_values["llm_backbone"] = possible_values.String(
values=(
"t5-small",
"t5-base",
"t5-large",
"google/flan-t5-small",
"google/flan-t5-base",
"google/flan-t5-large",
"google/flan-ul2",
),
values=default_cfg.default_sequence_to_sequence_models,
allow_custom=True,
)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "h2o-llmstudio"
version = "1.13.0-dev"
version = "1.14.0-dev"
readme = "README.md"
license = {file = "LICENSE"}

Expand Down
Loading

0 comments on commit f952547

Please sign in to comment.