activations_tf.py
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import tensorflow as tf
from packaging import version


def _gelu(x):
    """
    Gaussian Error Linear Unit. Original implementation of the GELU activation function in the Google BERT repo when
    initially created. For information: OpenAI GPT's GELU is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

    Also see https://arxiv.org/abs/1606.08415
    """
    x = tf.convert_to_tensor(x)
    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.cast(tf.sqrt(2.0), x.dtype)))

    return x * cdf


def _gelu_bf16(x):
    """
    Gaussian Error Linear Unit, same erf-based formulation as `_gelu` above, but with every constant cast to the
    input dtype (the literal 1.4142135623730951 is sqrt(2)) so the computation stays in bfloat16 end to end.

    Also see https://arxiv.org/abs/1606.08415
    """
    x = tf.convert_to_tensor(x)
    cdf = tf.cast(0.5, x.dtype) * (
        tf.cast(1.0, x.dtype) + tf.math.erf(x / tf.cast(1.4142135623730951, x.dtype))
    )

    return x * cdf


def _gelu_new(x):
    """
    Gaussian Error Linear Unit. This is a smoother (tanh-based) approximation of the GELU. Original paper:
    https://arxiv.org/abs/1606.08415

    Args:
        x: float Tensor to perform activation

    Returns:
        `x` with the GELU activation applied.
    """
    x = tf.convert_to_tensor(x)
    pi = tf.cast(math.pi, x.dtype)
    coeff = tf.cast(0.044715, x.dtype)
    cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / pi) * (x + coeff * tf.pow(x, 3))))

    return x * cdf
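

# Illustrative sketch (not part of the original module): the erf-based `_gelu` and the tanh approximation `_gelu_new`
# agree to within about 1e-3 over typical input ranges. The helper below is a hypothetical utility, added here only
# to make that relationship concrete; it is never invoked by the library code.
def _compare_gelu_variants():
    x = tf.linspace(-3.0, 3.0, 101)
    exact = _gelu(x)
    approx = _gelu_new(x)
    # Maximum absolute gap between the exact and approximate formulations.
    return tf.reduce_max(tf.abs(exact - approx))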


def mish(x):
    # Mish activation: x * tanh(softplus(x))
    x = tf.convert_to_tensor(x)

    return x * tf.tanh(tf.math.softplus(x))


def gelu_fast(x):
    x = tf.convert_to_tensor(x)
    coeff1 = tf.cast(0.044715, x.dtype)
    coeff2 = tf.cast(0.7978845608, x.dtype)  # ~= sqrt(2 / pi)

    return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x)))


def quick_gelu(x):
    # Sigmoid-based approximation of the GELU: x * sigmoid(1.702 * x)
    x = tf.convert_to_tensor(x)
    coeff = tf.cast(1.702, x.dtype)

    return x * tf.math.sigmoid(coeff * x)


def gelu_10(x):
    """
    Clip the range of possible GeLU outputs to [-10, 10]. This is especially useful for quantization purposes, as it
    allows mapping 2 negative values in the GeLU spectrum. For more information on this trick, please refer to
    https://arxiv.org/abs/2004.09602

    Gaussian Error Linear Unit. Original implementation of the GELU activation function in the Google BERT repo when
    initially created. For information: OpenAI GPT's GELU is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

    Also see https://arxiv.org/abs/1606.08415

    :param x:
    :return:
    """
    return tf.clip_by_value(_gelu(x), -10, 10)
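

# Illustrative sketch (not part of the original module): `gelu_10` behaves like `_gelu` near zero and saturates at
# +/-10 for large-magnitude inputs, which bounds the output range seen during quantization. The helper below is a
# hypothetical example added only for documentation purposes.
def _gelu_10_saturation_example():
    x = tf.constant([-100.0, 0.5, 100.0])
    return gelu_10(x)  # approximately [0.0, 0.35, 10.0]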


def glu(x, axis=-1):
    """
    Gated Linear Unit. Implementation as defined in the original paper (see https://arxiv.org/abs/1612.08083), where
    the input `x` is split in two halves, A and B, across a dimension (`axis`), returning A * sigmoid(B).

    Args:
        `x`: float Tensor to perform activation
        `axis`: dimension across which `x` will be split in half

    Returns:
        `x` with the GLU activation applied (with its size halved across the dimension `axis`).
    """
    a, b = tf.split(x, 2, axis=axis)

    return a * tf.math.sigmoid(b)
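

# Illustrative sketch (not part of the original module): `glu` halves the chosen dimension, so a (batch, 2 * d) input
# yields a (batch, d) output. The helper below is a hypothetical example added only for documentation purposes.
def _glu_shape_example():
    x = tf.random.normal((4, 16))  # last dimension has even size 16
    y = glu(x, axis=-1)            # gated half: shape (4, 8)
    return y.shape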


if version.parse(tf.version.VERSION) >= version.parse("2.4"):

    def approximate_gelu_wrap(x):
        return tf.keras.activations.gelu(x, approximate=True)

    gelu = tf.keras.activations.gelu
    gelu_new = approximate_gelu_wrap
else:
    gelu = _gelu
    gelu_new = _gelu_new


ACT2FN = {
    "gelu": gelu,
    "relu": tf.keras.activations.relu,
    "swish": tf.keras.activations.swish,
    "silu": tf.keras.activations.swish,
    "gelu_new": gelu_new,
    "mish": mish,
    "tanh": tf.keras.activations.tanh,
    "gelu_fast": gelu_fast,
    "quick_gelu": quick_gelu,
    "gelu_10": gelu_10,
    "glu": glu,
    "gelu_bf16": _gelu_bf16,
}


def get_tf_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError(
            f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
        )
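

# Illustrative usage sketch (not part of the original module): activations are looked up by name via
# `get_tf_activation` and then applied like any other callable. The values below are arbitrary example inputs.
if __name__ == "__main__":
    act = get_tf_activation("gelu")
    x = tf.constant([-1.0, 0.0, 1.0])
    print(act(x))                              # exact or keras GELU, depending on the TF version
    print(get_tf_activation("quick_gelu")(x))  # sigmoid-based approximation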