-
Notifications
You must be signed in to change notification settings - Fork 18
/
mamba_config.py
86 lines (73 loc) · 2.7 KB
/
mamba_config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from dataclasses import dataclass
from typing import Callable, Optional

import torch
import torch.nn.functional as F

from utils import init_method_normal, scaled_init_method_normal
@dataclass
class MambaConfig():
    """Hyperparameter container for a Mamba (selective state-space) model.

    Zero / None defaults are placeholders: callers are expected to override
    them, or they are derived in ``__post_init__`` (e.g. ``ffn_hidden_size``,
    ``init_method``, ``output_layer_init_method``).
    """

    # --- model topology ---
    base_model_type: str = "mamba"
    num_layers: int = 0
    hidden_size: int = 0
    state_size: int = 0
    vocab_size: int = 50000
    expansion_factor: int = 2
    conv_dimension: int = 0
    conv_bias: bool = True
    bias: bool = True
    use_fast_path: bool = True
    # --- dt (discretization step) parameterization ---
    dt_rank: str = "auto"
    dt_min: float = 0.001
    dt_max: float = 0.1
    dt_init: str = "random"
    dt_scale: float = 1.0
    dt_init_floor: float = 1e-4
    # --- normalization / residual handling ---
    rms_norm: bool = True
    fused_add_norm: bool = False
    residual_in_fp32: bool = True
    hidden_dropout: float = 0.0
    # Defaults to 4 * hidden_size in __post_init__ when left as None.
    ffn_hidden_size: Optional[int] = None
    gated_linear_unit: bool = False
    mamba_moe_layers: str = ""
    routing_mode: str = "sinkhorn"
    device: str = "cuda"
    fp32_residual_connection: bool = False
    layernorm_epsilon: float = 1e-5
    layernorm_zero_centered_gamma: bool = False
    add_bias_linear: bool = True
    activation_func: Callable = F.gelu
    num_moe_experts: Optional[int] = None
    # --- initialization (derived from init_method_std when left as None) ---
    init_method: Optional[Callable] = None
    output_layer_init_method: Optional[Callable] = None
    init_method_std: float = 0.02
    # --- mixed-precision ---
    apply_query_key_layer_scaling: bool = True
    attention_softmax_in_fp32: bool = True
    # --- fusion ---
    bias_gelu_fusion: bool = False
    persist_layer_norm: bool = False
    bias_dropout_fusion: bool = False

    def __post_init__(self):
        """Derive dependent defaults and validate fusion flags.

        See https://docs.python.org/3/library/dataclasses.html#post-init-processing
        for details on dataclass post-init processing.

        Raises:
            ValueError: if ``bias_gelu_fusion`` is enabled without
                ``add_bias_linear``, or with an activation other than F.gelu.
        """
        # QK layer scaling requires the softmax to run in fp32 for stability.
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        # Conventional transformer FFN width: 4x the model width.
        if self.ffn_hidden_size is None:
            self.ffn_hidden_size = 4 * self.hidden_size
        # The fused bias+gelu kernel only exists for biased linear + gelu.
        if self.bias_gelu_fusion:
            if not self.add_bias_linear:
                raise ValueError(
                    "When bias_gelu_fusion is True, add_bias_linear must also be True."
                )
            if self.activation_func != F.gelu:
                raise ValueError('When bias_gelu_fusion is True, activation_func must be F.gelu.')
        if self.init_method is None:
            self.init_method = init_method_normal(self.init_method_std)
        if self.output_layer_init_method is None:
            self.output_layer_init_method = scaled_init_method_normal(
                self.init_method_std, self.num_layers
            )