From b230c77d23ccf3786cbcd9a38581781a568bd07f Mon Sep 17 00:00:00 2001
From: Pascal Pfeiffer <1069138+pascal-pfeiffer@users.noreply.github.com>
Date: Thu, 26 Sep 2024 15:08:37 +0200
Subject: [PATCH] [FEATURE] Default model list can be configured with ENV vars
 (#870)

* move model list to default_cfg

* H2O_LLM_STUDIO_DEFAULT_LM_MODELS & H2O_LLM_STUDIO_DEFAULT_S2S_MODELS
  comma separated list of model names

* format

---
 llm_studio/app_utils/config.py                | 43 +++++++++++++++++++
 ...t_causal_classification_modeling_config.py | 28 +++---------
 .../text_causal_language_modeling_config.py   | 28 +++---------
 .../text_causal_regression_modeling_config.py | 28 +++---------
 .../text_dpo_modeling_config.py               | 28 +++---------
 ...xt_sequence_to_sequence_modeling_config.py | 17 +++----
 6 files changed, 78 insertions(+), 94 deletions(-)

diff --git a/llm_studio/app_utils/config.py b/llm_studio/app_utils/config.py
index 76faa354c..2c1b023a1 100644
--- a/llm_studio/app_utils/config.py
+++ b/llm_studio/app_utils/config.py
@@ -43,6 +43,47 @@ def get_size(x):
 
 url = f"http://{host}:{port}/"
 
+if os.getenv("H2O_LLM_STUDIO_DEFAULT_LM_MODELS"):
+    default_causal_language_models = [
+        mdl.strip() for mdl in os.getenv("H2O_LLM_STUDIO_DEFAULT_LM_MODELS").split(",")
+    ]
+else:
+    default_causal_language_models = [
+        "h2oai/h2o-danube3-500m-base",
+        "h2oai/h2o-danube3-500m-chat",
+        "h2oai/h2o-danube3-4b-base",
+        "h2oai/h2o-danube3-4b-chat",
+        "h2oai/h2o-danube2-1.8b-base",
+        "h2oai/h2o-danube2-1.8b-chat",
+        "meta-llama/Llama-3.2-1B-Instruct",
+        "meta-llama/Llama-3.2-3B-Instruct",
+        "meta-llama/Meta-Llama-3.1-8B-Instruct",
+        "meta-llama/Meta-Llama-3.1-70B-Instruct",
+        "mistralai/Mistral-7B-v0.3",
+        "mistralai/Mistral-7B-Instruct-v0.2",
+        "google/gemma-2-2b-it",
+        "google/gemma-2-9b-it",
+        "microsoft/Phi-3-mini-4k-instruct",
+        "microsoft/Phi-3-medium-4k-instruct",
+        "Qwen/Qwen2-7B-Instruct",
+        "Qwen/Qwen2-72B-Instruct",
+    ]
+
+if os.getenv("H2O_LLM_STUDIO_DEFAULT_S2S_MODELS"):
+    default_sequence_to_sequence_models = [
+        mdl.strip() for mdl in os.getenv("H2O_LLM_STUDIO_DEFAULT_S2S_MODELS").split(",")
+    ]
+else:
+    default_sequence_to_sequence_models = [
+        "t5-small",
+        "t5-base",
+        "t5-large",
+        "google/flan-t5-small",
+        "google/flan-t5-base",
+        "google/flan-t5-large",
+        "google/flan-ul2",
+    ]
+
 default_cfg = {
     "url": url,
     "name": "H2O LLM Studio",
@@ -67,6 +108,8 @@ def get_size(x):
         "text_sequence_to_sequence_modeling_config",
         "text_dpo_modeling_config",
     ],
+    "default_causal_language_models": default_causal_language_models,
+    "default_sequence_to_sequence_models": default_sequence_to_sequence_models,
     "problem_categories": ["text"],
     "dataset_keys": [
         "train_dataframe",
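The two blocks above are the whole mechanism: when a variable is set, its
comma-separated entries replace the hard-coded list, with surrounding
whitespace stripped from each name. A minimal standalone sketch of that
behaviour (the override value is a hypothetical example; the parsing simply
mirrors the patch):

    import os

    # Hypothetical override value; any comma-separated model names work.
    os.environ["H2O_LLM_STUDIO_DEFAULT_LM_MODELS"] = (
        "h2oai/h2o-danube3-500m-chat, meta-llama/Llama-3.2-1B-Instruct"
    )

    # Same parsing as config.py: split on commas, strip whitespace.
    models = [
        mdl.strip()
        for mdl in os.environ["H2O_LLM_STUDIO_DEFAULT_LM_MODELS"].split(",")
    ]
    assert models == [
        "h2oai/h2o-danube3-500m-chat",
        "meta-llama/Llama-3.2-1B-Instruct",
    ]
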
output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}" experiment_name: str = field(default_factory=generate_experiment_name) - llm_backbone: str = "h2oai/h2o-danube3-500m-chat" + llm_backbone: str = ( + "h2oai/h2o-danube3-500m-chat" + if "h2oai/h2o-danube3-500m-chat" in default_cfg.default_causal_language_models + else default_cfg.default_causal_language_models[0] + ) dataset: ConfigNLPCausalClassificationDataset = field( default_factory=ConfigNLPCausalClassificationDataset @@ -188,26 +193,7 @@ def __post_init__(self): self._visibility["output_directory"] = -1 self._possible_values["llm_backbone"] = possible_values.String( - values=( - "h2oai/h2o-danube3-500m-base", - "h2oai/h2o-danube3-500m-chat", - "h2oai/h2o-danube3-4b-base", - "h2oai/h2o-danube3-4b-chat", - "h2oai/h2o-danube2-1.8b-base", - "h2oai/h2o-danube2-1.8b-chat", - "meta-llama/Llama-3.2-1B-Instruct", - "meta-llama/Llama-3.2-3B-Instruct", - "meta-llama/Meta-Llama-3.1-8B-Instruct", - "meta-llama/Meta-Llama-3.1-70B-Instruct", - "mistralai/Mistral-7B-v0.3", - "mistralai/Mistral-7B-Instruct-v0.2", - "google/gemma-2-2b-it", - "google/gemma-2-9b-it", - "microsoft/Phi-3-mini-4k-instruct", - "microsoft/Phi-3-medium-4k-instruct", - "Qwen/Qwen2-7B-Instruct", - "Qwen/Qwen2-72B-Instruct", - ), + values=default_cfg.default_causal_language_models, allow_custom=True, ) diff --git a/llm_studio/python_configs/text_causal_language_modeling_config.py b/llm_studio/python_configs/text_causal_language_modeling_config.py index ec8763432..e74cd42c4 100644 --- a/llm_studio/python_configs/text_causal_language_modeling_config.py +++ b/llm_studio/python_configs/text_causal_language_modeling_config.py @@ -6,6 +6,7 @@ import torch import llm_studio.src.datasets.text_causal_language_modeling_ds +from llm_studio.app_utils.config import default_cfg from llm_studio.python_configs.base import DefaultConfig, DefaultConfigProblemBase from llm_studio.src import possible_values from llm_studio.src.augmentations.nlp_aug import BaseNLPAug @@ -613,7 +614,11 @@ def __post_init__(self): class ConfigProblemBase(DefaultConfigProblemBase): output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}" experiment_name: str = field(default_factory=generate_experiment_name) - llm_backbone: str = "h2oai/h2o-danube3-500m-base" + llm_backbone: str = ( + "h2oai/h2o-danube3-500m-base" + if "h2oai/h2o-danube3-500m-base" in default_cfg.default_causal_language_models + else default_cfg.default_causal_language_models[0] + ) dataset: ConfigNLPCausalLMDataset = field(default_factory=ConfigNLPCausalLMDataset) tokenizer: ConfigNLPCausalLMTokenizer = field( @@ -640,26 +645,7 @@ def __post_init__(self): self._visibility["output_directory"] = -1 self._possible_values["llm_backbone"] = possible_values.String( - values=( - "h2oai/h2o-danube3-500m-base", - "h2oai/h2o-danube3-500m-chat", - "h2oai/h2o-danube3-4b-base", - "h2oai/h2o-danube3-4b-chat", - "h2oai/h2o-danube2-1.8b-base", - "h2oai/h2o-danube2-1.8b-chat", - "meta-llama/Llama-3.2-1B-Instruct", - "meta-llama/Llama-3.2-3B-Instruct", - "meta-llama/Meta-Llama-3.1-8B-Instruct", - "meta-llama/Meta-Llama-3.1-70B-Instruct", - "mistralai/Mistral-7B-v0.3", - "mistralai/Mistral-7B-Instruct-v0.2", - "google/gemma-2-2b-it", - "google/gemma-2-9b-it", - "microsoft/Phi-3-mini-4k-instruct", - "microsoft/Phi-3-medium-4k-instruct", - "Qwen/Qwen2-7B-Instruct", - "Qwen/Qwen2-72B-Instruct", - ), + values=default_cfg.default_causal_language_models, allow_custom=True, ) diff --git 
diff --git a/llm_studio/python_configs/text_causal_regression_modeling_config.py b/llm_studio/python_configs/text_causal_regression_modeling_config.py
index b6cba152b..3f6f448c4 100644
--- a/llm_studio/python_configs/text_causal_regression_modeling_config.py
+++ b/llm_studio/python_configs/text_causal_regression_modeling_config.py
@@ -4,6 +4,7 @@
 
 import llm_studio.src.datasets.text_causal_regression_ds
 import llm_studio.src.plots.text_causal_classification_modeling_plots
+from llm_studio.app_utils.config import default_cfg
 from llm_studio.python_configs.base import DefaultConfig, DefaultConfigProblemBase
 from llm_studio.python_configs.text_causal_classification_modeling_config import (
     ConfigNLPCausalClassificationAugmentation as ConfigNLPCausalRegressionAugmentation,
@@ -109,7 +110,11 @@ def __post_init__(self):
 class ConfigProblemBase(DefaultConfigProblemBase):
     output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}"
     experiment_name: str = field(default_factory=generate_experiment_name)
-    llm_backbone: str = "h2oai/h2o-danube3-500m-chat"
+    llm_backbone: str = (
+        "h2oai/h2o-danube3-500m-chat"
+        if "h2oai/h2o-danube3-500m-chat" in default_cfg.default_causal_language_models
+        else default_cfg.default_causal_language_models[0]
+    )
 
     dataset: ConfigNLPCausalRegressionDataset = field(
         default_factory=ConfigNLPCausalRegressionDataset
@@ -142,26 +147,7 @@ def __post_init__(self):
         self._visibility["output_directory"] = -1
 
         self._possible_values["llm_backbone"] = possible_values.String(
-            values=(
-                "h2oai/h2o-danube3-500m-base",
-                "h2oai/h2o-danube3-500m-chat",
-                "h2oai/h2o-danube3-4b-base",
-                "h2oai/h2o-danube3-4b-chat",
-                "h2oai/h2o-danube2-1.8b-base",
-                "h2oai/h2o-danube2-1.8b-chat",
-                "meta-llama/Llama-3.2-1B-Instruct",
-                "meta-llama/Llama-3.2-3B-Instruct",
-                "meta-llama/Meta-Llama-3.1-8B-Instruct",
-                "meta-llama/Meta-Llama-3.1-70B-Instruct",
-                "mistralai/Mistral-7B-v0.3",
-                "mistralai/Mistral-7B-Instruct-v0.2",
-                "google/gemma-2-2b-it",
-                "google/gemma-2-9b-it",
-                "microsoft/Phi-3-mini-4k-instruct",
-                "microsoft/Phi-3-medium-4k-instruct",
-                "Qwen/Qwen2-7B-Instruct",
-                "Qwen/Qwen2-72B-Instruct",
-            ),
+            values=default_cfg.default_causal_language_models,
             allow_custom=True,
         )
 
diff --git a/llm_studio/python_configs/text_dpo_modeling_config.py b/llm_studio/python_configs/text_dpo_modeling_config.py
index 44f2a7f65..06cc7c5d0 100644
--- a/llm_studio/python_configs/text_dpo_modeling_config.py
+++ b/llm_studio/python_configs/text_dpo_modeling_config.py
@@ -3,6 +3,7 @@
 from typing import Any
 
 import llm_studio.src.datasets.text_dpo_modeling_ds
+from llm_studio.app_utils.config import default_cfg
 from llm_studio.python_configs.base import DefaultConfigProblemBase
 from llm_studio.python_configs.text_causal_language_modeling_config import (
     ConfigNLPAugmentation,
@@ -108,7 +109,11 @@ class ConfigProblemBase(DefaultConfigProblemBase):
     output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}"
     experiment_name: str = field(default_factory=generate_experiment_name)
-    llm_backbone: str = "h2oai/h2o-danube3-500m-chat"
+    llm_backbone: str = (
+        "h2oai/h2o-danube3-500m-chat"
+        if "h2oai/h2o-danube3-500m-chat" in default_cfg.default_causal_language_models
+        else default_cfg.default_causal_language_models[0]
+    )
 
     dataset: ConfigDPODataset = field(default_factory=ConfigDPODataset)
     tokenizer: ConfigNLPCausalLMTokenizer = field(
@@ -129,25 +134,6 @@ def __post_init__(self):
         super().__post_init__()
         self._visibility["output_directory"] = -1
         self._possible_values["llm_backbone"] = possible_values.String(
-            values=(
-                "h2oai/h2o-danube3-500m-base",
-                "h2oai/h2o-danube3-500m-chat",
-                "h2oai/h2o-danube3-4b-base",
-                "h2oai/h2o-danube3-4b-chat",
-                "h2oai/h2o-danube2-1.8b-base",
-                "h2oai/h2o-danube2-1.8b-chat",
-                "meta-llama/Llama-3.2-1B-Instruct",
-                "meta-llama/Llama-3.2-3B-Instruct",
-                "meta-llama/Meta-Llama-3.1-8B-Instruct",
-                "meta-llama/Meta-Llama-3.1-70B-Instruct",
-                "mistralai/Mistral-7B-v0.3",
-                "mistralai/Mistral-7B-Instruct-v0.2",
-                "google/gemma-2-2b-it",
-                "google/gemma-2-9b-it",
-                "microsoft/Phi-3-mini-4k-instruct",
-                "microsoft/Phi-3-medium-4k-instruct",
-                "Qwen/Qwen2-7B-Instruct",
-                "Qwen/Qwen2-72B-Instruct",
-            ),
+            values=default_cfg.default_causal_language_models,
             allow_custom=True,
         )
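Because every causal problem type now reads the same list, one invariant
holds for all of them: the default llm_backbone is always a member of the
configured model list, whether or not an ENV override is active. A
sanity-check sketch, assuming llm_studio is importable and the config
dataclasses can be instantiated with their defaults:

    from llm_studio.app_utils import config
    from llm_studio.python_configs.text_causal_regression_modeling_config import (
        ConfigProblemBase as RegressionConfig,
    )
    from llm_studio.python_configs.text_dpo_modeling_config import (
        ConfigProblemBase as DPOConfig,
    )

    # By construction, the default is either the historical backbone (when
    # it survives an override) or the first entry of the configured list.
    for config_cls in (RegressionConfig, DPOConfig):
        assert config_cls().llm_backbone in config.default_causal_language_models
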
diff --git a/llm_studio/python_configs/text_sequence_to_sequence_modeling_config.py b/llm_studio/python_configs/text_sequence_to_sequence_modeling_config.py
index e72b017e1..fc7e18723 100644
--- a/llm_studio/python_configs/text_sequence_to_sequence_modeling_config.py
+++ b/llm_studio/python_configs/text_sequence_to_sequence_modeling_config.py
@@ -2,6 +2,7 @@
 from dataclasses import dataclass, field
 from typing import Any, Dict, List
 
+from llm_studio.app_utils.config import default_cfg
 from llm_studio.python_configs.base import DefaultConfigProblemBase
 from llm_studio.python_configs.text_causal_language_modeling_config import (
     ConfigNLPAugmentation,
@@ -72,7 +73,11 @@ def __post_init__(self):
 class ConfigProblemBase(DefaultConfigProblemBase):
     output_directory: str = f"output/{os.path.basename(__file__).split('.')[0]}"
     experiment_name: str = field(default_factory=generate_experiment_name)
-    llm_backbone: str = "t5-small"
+    llm_backbone: str = (
+        "t5-small"
+        if "t5-small" in default_cfg.default_sequence_to_sequence_models
+        else default_cfg.default_sequence_to_sequence_models[0]
+    )
 
     dataset: ConfigNLPSeq2SeqDataset = field(default_factory=ConfigNLPSeq2SeqDataset)
     tokenizer: ConfigNLPCausalLMTokenizer = field(
@@ -99,15 +104,7 @@ def __post_init__(self):
         self._visibility["output_directory"] = -1
 
         self._possible_values["llm_backbone"] = possible_values.String(
-            values=(
-                "t5-small",
-                "t5-base",
-                "t5-large",
-                "google/flan-t5-small",
-                "google/flan-t5-base",
-                "google/flan-t5-large",
-                "google/flan-ul2",
-            ),
+            values=default_cfg.default_sequence_to_sequence_models,
             allow_custom=True,
         )
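One operational note: config.py evaluates both variables at module import
time, so they must be present in the environment before
llm_studio.app_utils.config is first imported (in practice, exported before
launching the app). A usage sketch with hypothetical override values:

    import os

    # Hypothetical overrides; values are comma-separated model names.
    os.environ["H2O_LLM_STUDIO_DEFAULT_LM_MODELS"] = (
        "h2oai/h2o-danube3-4b-chat,Qwen/Qwen2-7B-Instruct"
    )
    os.environ["H2O_LLM_STUDIO_DEFAULT_S2S_MODELS"] = (
        "google/flan-t5-base, google/flan-t5-large"
    )

    # Import only after the variables are set: both lists are built at
    # import and are not re-read afterwards.
    from llm_studio.app_utils import config

    assert config.default_causal_language_models == [
        "h2oai/h2o-danube3-4b-chat",
        "Qwen/Qwen2-7B-Instruct",
    ]
    assert config.default_sequence_to_sequence_models == [
        "google/flan-t5-base",
        "google/flan-t5-large",
    ]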