Add reasonable LLM fine-tuning defaults #3549

Closed · wants to merge 2 commits
6 changes: 3 additions & 3 deletions ludwig/schema/model_types/llm.py
@@ -19,7 +19,7 @@
from ludwig.schema.llms.quantization import QuantizationConfig, QuantizationConfigField
from ludwig.schema.model_types.base import ModelConfig, register_model_type
from ludwig.schema.preprocessing import PreprocessingConfig, PreprocessingField
from ludwig.schema.trainer import LLMTrainerConfig, LLMTrainerDataclassField
from ludwig.schema.trainer import BaseTrainerConfig, LLMTrainerDataclassField
from ludwig.schema.utils import ludwig_dataclass


@@ -42,8 +42,8 @@ class LLMModelConfig(ModelConfig):

prompt: PromptConfig = PromptConfigField().get_default_field()

# trainer: LLMTrainerConfig = LLMTrainerField().get_default_field()
trainer: LLMTrainerConfig = LLMTrainerDataclassField(
# trainer: BaseTrainerConfig = LLMTrainerField().get_default_field()
trainer: BaseTrainerConfig = LLMTrainerDataclassField(
description="The trainer to use for the model",
)

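Note on the change above: with the `trainer` field annotated as `BaseTrainerConfig`, an LLM config can carry either the zero-shot "none" trainer or the new "finetune" trainer. A minimal sketch of the two variants as Python config dicts — the `base_model` value and feature names are placeholders, and validating via `ModelConfig.from_dict` is an assumption about the schema API rather than something shown in this diff:

from ludwig.schema.model_types.base import ModelConfig

# Zero-shot / few-shot: no weight updates, so the "none" trainer is used.
zero_shot_config = {
    "model_type": "llm",
    "base_model": "facebook/opt-350m",  # placeholder model name
    "input_features": [{"name": "prompt", "type": "text"}],
    "output_features": [{"name": "answer", "type": "text"}],
    "trainer": {"type": "none"},
}

# Fine-tuning: picks up the new FineTuneTrainerConfig defaults from this PR.
fine_tune_config = {**zero_shot_config, "trainer": {"type": "finetune"}}

# Both variants should validate against the same LLMModelConfig schema,
# since the trainer field now accepts any BaseTrainerConfig subclass.
for cfg in (zero_shot_config, fine_tune_config):
    ModelConfig.from_dict(cfg)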
114 changes: 75 additions & 39 deletions ludwig/schema/trainer.py
@@ -26,6 +26,32 @@
_llm_trainer_schema_registry = Registry()


LEARNING_RATE_DESCRIPTION = (
"Controls how much to change the model in response to the estimated error each time the model weights are "
"updated. If 'auto', the optimal learning rate is estimated by choosing the learning rate that produces "
"the smallest non-diverging gradient update."
)

EFFECTIVE_BATCH_SIZE_DESCRIPTION = (
"The effective batch size is the total number of samples used to compute a single gradient update "
"to the model weights. This differs from `batch_size` by taking `gradient_accumulation_steps` and number "
"of training worker processes into account. In practice, "
"`effective_batch_size = batch_size * gradient_accumulation_steps * num_workers`. "
"If 'auto', the effective batch size is derivied implicitly from `batch_size`, but if set explicitly, then "
"one of `batch_size` or `gradient_accumulation_steps` must be set to something other than 'auto', and "
"consequently will be set following the formula given above."
)

BATCH_SIZE_DESCRIPTION = (
"The number of training examples utilized in one training step of the model. If ’auto’, the "
"batch size that maximized training throughput (samples / sec) will be used. For CPU training, the "
"tuned batch size is capped at 128 as throughput benefits of large batch sizes are less noticeable without "
"a GPU."
)

EPOCHS_DESCRIPTION = "Number of epochs the algorithm is intended to be run over. Overridden if `train_steps` is set"


@DeveloperAPI
def register_trainer_schema(model_type: str):
def wrap(trainer_config: BaseTrainerConfig):
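The relationship stated in EFFECTIVE_BATCH_SIZE_DESCRIPTION can be illustrated with plain arithmetic. The helper below is an illustrative sketch, not part of Ludwig: given an explicit effective batch size and a fixed per-device batch size, the number of gradient accumulation steps falls out of the formula above.

def derive_gradient_accumulation_steps(
    effective_batch_size: int, batch_size: int, num_workers: int
) -> int:
    # effective_batch_size = batch_size * gradient_accumulation_steps * num_workers
    # => gradient_accumulation_steps = effective_batch_size / (batch_size * num_workers)
    steps = effective_batch_size // (batch_size * num_workers)
    return max(steps, 1)  # never accumulate fewer than one step

# Example: the new fine-tuning default of effective_batch_size=32,
# with a per-device batch_size of 4 on 2 training workers:
assert derive_gradient_accumulation_steps(32, 4, 2) == 4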
@@ -135,11 +161,7 @@ def __post_init__(self):
learning_rate: Union[float, str] = schema_utils.OneOfOptionsField(
default=0.001,
allow_none=False,
description=(
"Controls how much to change the model in response to the estimated error each time the model weights are "
"updated. If 'auto', the optimal learning rate is estimated by choosing the learning rate that produces "
"the smallest non-diverging gradient update."
),
description=LEARNING_RATE_DESCRIPTION,
parameter_metadata=TRAINER_METADATA[MODEL_ECD]["learning_rate"],
field_options=[
schema_utils.FloatRange(default=0.001, allow_none=False, min=0, max=1),
@@ -154,7 +176,7 @@ def __post_init__(self):

epochs: int = schema_utils.PositiveInteger(
default=100,
description="Number of epochs the algorithm is intended to be run over. Overridden if `train_steps` is set",
description=EPOCHS_DESCRIPTION,
parameter_metadata=TRAINER_METADATA[MODEL_ECD]["epochs"],
)

@@ -189,15 +211,7 @@ def __post_init__(self):
effective_batch_size: Union[int, str] = schema_utils.OneOfOptionsField(
default=AUTO,
allow_none=False,
description=(
"The effective batch size is the total number of samples used to compute a single gradient update "
"to the model weights. This differs from `batch_size` by taking `gradient_accumulation_steps` and number "
"of training worker processes into account. In practice, "
"`effective_batch_size = batch_size * gradient_accumulation_steps * num_workers`. "
"If 'auto', the effective batch size is derivied implicitly from `batch_size`, but if set explicitly, then "
"one of `batch_size` or `gradient_accumulation_steps` must be set to something other than 'auto', and "
"consequently will be set following the formula given above."
),
description=EFFECTIVE_BATCH_SIZE_DESCRIPTION,
parameter_metadata=TRAINER_METADATA[MODEL_ECD]["effective_batch_size"],
field_options=[
schema_utils.PositiveInteger(default=128, description="", allow_none=False),
@@ -208,12 +222,7 @@
batch_size: Union[int, str] = schema_utils.OneOfOptionsField(
default=AUTO,
allow_none=False,
description=(
"The number of training examples utilized in one training step of the model. If ’auto’, the "
"batch size that maximized training throughput (samples / sec) will be used. For CPU training, the "
"tuned batch size is capped at 128 as throughput benefits of large batch sizes are less noticeable without "
"a GPU."
),
description=BATCH_SIZE_DESCRIPTION,
parameter_metadata=TRAINER_METADATA[MODEL_ECD]["batch_size"],
field_options=[
schema_utils.PositiveInteger(default=128, description="", allow_none=False),
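The 'auto' behavior described in BATCH_SIZE_DESCRIPTION implies a cap when training on CPU. A minimal sketch of that rule — the function name and the gpu_available flag are hypothetical, used only to make the documented behavior concrete:

MAX_CPU_BATCH_SIZE = 128  # cap described in BATCH_SIZE_DESCRIPTION

def resolve_auto_batch_size(tuned_batch_size: int, gpu_available: bool) -> int:
    # Keep the throughput-tuned batch size on GPU; cap it on CPU, where
    # larger batches give diminishing throughput returns.
    if gpu_available:
        return tuned_batch_size
    return min(tuned_batch_size, MAX_CPU_BATCH_SIZE)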
@@ -766,10 +775,17 @@ def can_tune_batch_size(self) -> bool:


@DeveloperAPI
@register_llm_trainer_schema("none")
@ludwig_dataclass
class LLMTrainerConfig(BaseTrainerConfig):
class NoneTrainerConfig(BaseTrainerConfig):
"""Base class for all LLM trainer configs."""

type: str = schema_utils.ProtectedString(
"none",
description="The type of trainer used to train the model. ",
parameter_metadata=TRAINER_METADATA[MODEL_LLM]["type"],
)

learning_rate: Union[float, str] = schema_utils.OneOfOptionsField(
default=0.0001,
allow_none=False,
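Registering NoneTrainerConfig directly under the "none" key (rather than keeping a separate LLMTrainerConfig base) relies on the decorator-based registry declared near the top of this file. A simplified sketch of that pattern, not Ludwig's actual Registry implementation:

_llm_trainer_schemas = {}  # simplified stand-in for Registry()

def register_llm_trainer_schema(trainer_type: str):
    def wrap(trainer_config):
        _llm_trainer_schemas[trainer_type] = trainer_config
        return trainer_config
    return wrap

@register_llm_trainer_schema("none")
class NoneTrainerConfigSketch:
    """Stand-in for the real NoneTrainerConfig dataclass."""
    type: str = "none"

# Trainer initialization can now look the schema up by its `type` string:
assert _llm_trainer_schemas["none"] is NoneTrainerConfigSketch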
@@ -840,20 +856,6 @@ class LLMTrainerConfig(BaseTrainerConfig):
description="Whether to evaluate the training set in the LLM trainer. Note: this operation may be slow.",
)


@DeveloperAPI
@register_llm_trainer_schema("none")
@ludwig_dataclass
class NoneTrainerConfig(LLMTrainerConfig):
"""Dataclass that configures most of the hyperparameters used for zero-shot / few-shot LLM model training."""

# Required for lookup during trainer initialization
type: str = schema_utils.ProtectedString(
"none",
description="The type of trainer used to train the model. ",
parameter_metadata=TRAINER_METADATA[MODEL_LLM]["type"],
)

def can_tune_batch_size(self) -> bool:
return False

@@ -867,9 +869,43 @@ class FineTuneTrainerConfig(ECDTrainerConfig):
# Required for lookup during trainer initialization
type: str = schema_utils.ProtectedString("finetune")

base_learning_rate: float = schema_utils.NonNegativeFloat(
default=0.0,
description="Base learning rate used for training in the LLM trainer.",
learning_rate: Union[float, str] = schema_utils.OneOfOptionsField(
default=0.0001,
allow_none=False,
description=LEARNING_RATE_DESCRIPTION,
parameter_metadata=TRAINER_METADATA[MODEL_ECD]["learning_rate"],
field_options=[
schema_utils.FloatRange(default=0.001, allow_none=False, min=0, max=1),
schema_utils.StringOptions(options=["auto"], default="auto", allow_none=False),
],
)

effective_batch_size: Union[int, str] = schema_utils.OneOfOptionsField(
default=32,
allow_none=False,
description=EFFECTIVE_BATCH_SIZE_DESCRIPTION,
parameter_metadata=TRAINER_METADATA[MODEL_ECD]["effective_batch_size"],
field_options=[
schema_utils.PositiveInteger(default=32, description="", allow_none=False),
schema_utils.StringOptions(options=["auto"], default="auto", allow_none=False),
],
)

batch_size: Union[int, str] = schema_utils.OneOfOptionsField(
default=AUTO,
allow_none=False,
description=BATCH_SIZE_DESCRIPTION,
parameter_metadata=TRAINER_METADATA[MODEL_ECD]["batch_size"],
field_options=[
schema_utils.PositiveInteger(default=1, description="", allow_none=False),
schema_utils.StringOptions(options=["auto"], default="auto", allow_none=False),
],
)

epochs: int = schema_utils.PositiveInteger(
default=3,
description=EPOCHS_DESCRIPTION,
parameter_metadata=TRAINER_METADATA[MODEL_ECD]["epochs"],
)


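Taken together, the new FineTuneTrainerConfig fields mean a fine-tuning config can omit most trainer settings and still get sensible values (learning_rate 0.0001, effective_batch_size 32, batch_size "auto", epochs 3). A sketch of how that might look through the Python API — the base_model value, feature names, and dataset path are placeholders:

from ludwig.api import LudwigModel

config = {
    "model_type": "llm",
    "base_model": "facebook/opt-350m",  # placeholder model name
    "input_features": [{"name": "instruction", "type": "text"}],
    "output_features": [{"name": "response", "type": "text"}],
    # Only the trainer type is specified; learning_rate, batch sizes, and
    # epochs fall back to the defaults introduced in this PR.
    "trainer": {"type": "finetune"},
}

model = LudwigModel(config)
# model.train(dataset="train.csv")  # dataset path is a placeholder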
4 changes: 2 additions & 2 deletions ludwig/schema/utils.py
@@ -52,15 +52,15 @@ def load_trainer_with_kwargs(
otherwise passes all other parameters through without change.
"""
from ludwig.constants import MODEL_ECD, MODEL_GBM, MODEL_LLM
from ludwig.schema.trainer import ECDTrainerConfig, GBMTrainerConfig, LLMTrainerConfig
from ludwig.schema.trainer import ECDTrainerConfig, GBMTrainerConfig, NoneTrainerConfig

# TODO: use registry pattern for trainers
if model_type == MODEL_ECD:
trainer_schema = ECDTrainerConfig
elif model_type == MODEL_GBM:
trainer_schema = GBMTrainerConfig
elif model_type == MODEL_LLM:
trainer_schema = LLMTrainerConfig
trainer_schema = NoneTrainerConfig

return load_config_with_kwargs(trainer_schema, kwargs)

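The `# TODO: use registry pattern for trainers` comment suggests replacing the if/elif dispatch with a lookup table. A hedged sketch of what that might look like once NoneTrainerConfig is the LLM default — the mapping and helper below are illustrative, not code from this PR:

from ludwig.constants import MODEL_ECD, MODEL_GBM, MODEL_LLM
from ludwig.schema.trainer import ECDTrainerConfig, GBMTrainerConfig, NoneTrainerConfig

# Illustrative replacement for the if/elif chain in load_trainer_with_kwargs.
_default_trainer_schemas = {
    MODEL_ECD: ECDTrainerConfig,
    MODEL_GBM: GBMTrainerConfig,
    MODEL_LLM: NoneTrainerConfig,
}

def get_default_trainer_schema(model_type: str):
    # Raising on unknown types is a design choice of this sketch; the
    # current code simply never reaches the return for unknown model types.
    try:
        return _default_trainer_schemas[model_type]
    except KeyError:
        raise ValueError(f"Unknown model type: {model_type}")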