From 1995c1356ad2017685643c3abf0be6ddcd24abc9 Mon Sep 17 00:00:00 2001 From: lihaibineric Date: Wed, 28 Feb 2024 14:14:32 +0800 Subject: [PATCH] Site updated: 2024-02-28 14:14:31 --- 2024/01/30/dl_summary/index.html | 8 ++++---- local-search.xml | 2 +- search.xml | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/2024/01/30/dl_summary/index.html b/2024/01/30/dl_summary/index.html index 5808444..880ba63 100644 --- a/2024/01/30/dl_summary/index.html +++ b/2024/01/30/dl_summary/index.html @@ -25,7 +25,7 @@ - + @@ -211,7 +211,7 @@ - 9k words + 9.7k words @@ -222,7 +222,7 @@ - 75 mins + 82 mins @@ -433,7 +433,7 @@

Achieving parameter sparsity

Batch Normalization

import numpy as np

class MyBN:
    def __init__(self, momentum=0.01, eps=1e-5, feat_dim=2):
        # Running statistics used at inference time
        self._running_mean = np.zeros(shape=(feat_dim,))
        self._running_var = np.ones(shape=(feat_dim,))
        self._momentum = momentum
        # Prevents the denominator from becoming zero
        self._eps = eps

        # beta and gamma are the learnable BN parameters; initialized as in the PyTorch docs
        self._beta = np.zeros(shape=(feat_dim,))
        self._gamma = np.ones(shape=(feat_dim,))

        # Training/inference switch (set to False for inference)
        self.training = True

    def batch_norm(self, x):
        if self.training:
            x_mean = x.mean(axis=0)
            x_var = x.var(axis=0)
            # Update rule for the running mean and variance
            self._running_mean = (1 - self._momentum) * x_mean + self._momentum * self._running_mean
            self._running_var = (1 - self._momentum) * x_var + self._momentum * self._running_var
            # BN formula from the paper
            x_hat = (x - x_mean) / np.sqrt(x_var + self._eps)
        else:
            x_hat = (x - self._running_mean) / np.sqrt(self._running_var + self._eps)
        return self._gamma * x_hat + self._beta

Transformer structure

-
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.dense = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # Scaled dot-product attention
        scaled_attention_logits = torch.matmul(q, k.transpose(-1, -2)) / torch.sqrt(torch.tensor(self.depth, dtype=torch.float32))
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        attention_weights = torch.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)

        # Merge heads back to (batch, seq_len, d_model)
        output = output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        output = self.dense(output)

        return output, attention_weights

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, dff):
        super(PositionwiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, dff)
        self.fc2 = nn.Linear(dff, d_model)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x

class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedForward(d_model, dff)

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

        self.dropout_rate = dropout_rate

    def forward(self, x, mask=None):
        # Self-attention sub-layer: dropout, add & norm
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)

        # Feed-forward sub-layer: dropout, add & norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

+
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim  # stored so forward() can reassemble the heads
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.query_fc = nn.Linear(embed_dim, embed_dim)
        self.key_fc = nn.Linear(embed_dim, embed_dim)
        self.value_fc = nn.Linear(embed_dim, embed_dim)
        self.fc_out = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        # Linearly project queries, keys, and values
        Q = self.query_fc(query)
        K = self.key_fc(key)
        V = self.value_fc(value)

        # Split the embedding into num_heads
        Q = Q.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

        # Calculate the attention scores
        scores = torch.matmul(Q, K.permute(0, 1, 3, 2)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-1e20"))

        # Apply softmax to get attention probabilities
        attention_weights = F.softmax(scores, dim=-1)

        # Apply dropout
        attention_weights = F.dropout(attention_weights, p=0.1, training=self.training)

        # Multiply the attention weights with the values
        output = torch.matmul(attention_weights, V)

        # Concatenate multi-heads and project
        output = output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.embed_dim)
        output = self.fc_out(output)

        return output, attention_weights

class PositionwiseFeedforward(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super(PositionwiseFeedforward, self).__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class EncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, hidden_dim):
        super(EncoderLayer, self).__init__()
        self.multihead_attention = MultiHeadAttention(embed_dim, num_heads)
        self.feed_forward = PositionwiseFeedforward(embed_dim, hidden_dim)
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.layer_norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, mask=None):
        # Multi-Head Attention sub-layer with residual connection
        residual = x
        x, _ = self.multihead_attention(x, x, x, mask)
        x = self.layer_norm1(x + residual)

        # Feed Forward sub-layer with residual connection
        residual = x
        x = self.feed_forward(x)
        x = self.layer_norm2(x + residual)

        return x

class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, hidden_dim):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.layers = nn.ModuleList([EncoderLayer(embed_dim, num_heads, hidden_dim) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x

diff --git a/local-search.xml b/local-search.xml index dd4db66..6b901e2 100644 --- a/local-search.xml +++ b/local-search.xml @@ -64,7 +64,7 @@ /2024/01/30/dl_summary/ - Deep Learning Knowledge Summary

Deep learning interview notes: this post collects basic concepts and common questions in deep learning.

https://blog.csdn.net/weixin_42693876/article/details/120345924

L2 norm https://blog.csdn.net/u010725283/article/details/79212762

L1/L2 norms https://blog.csdn.net/weixin_35849560/article/details/113395018

Why Transformer uses multiple heads https://www.zhihu.com/question/341222779

LayerNorm in Transformer https://blog.csdn.net/weixin_45069761/article/details/107834049

https://zhuanlan.zhihu.com/p/560482252

Batch vs. mini-batch https://link.zhihu.com/?target=https%3A//blog.csdn.net/xys430381_1/article/details/80680167

Optimizers https://zhuanlan.zhihu.com/p/78622301

BN https://zhuanlan.zhihu.com/p/93643523

Neural network weight initialization https://blog.csdn.net/kebu12345678/article/details/103084851

https://zhuanlan.zhihu.com/p/667048896

https://zhuanlan.zhihu.com/p/643560888

BERT model details https://www.zhihu.com/question/534763354

Why BERT's three embeddings can be added together https://www.zhihu.com/question/374835153/answer/1080315948

LLaMA 2 architecture https://blog.csdn.net/sikh_0529/article/details/134375318

Rotary position embedding (RoPE) https://www.zhihu.com/tardis/zm/art/647109286?source_id=1005

QLoRA https://zhuanlan.zhihu.com/p/618894919

RLHF https://zhuanlan.zhihu.com/p/631238431

    https://zhuanlan.zhihu.com/p/599016986

Logistic regression vs. linear regression

Linear regression solves regression problems; logistic regression builds on linear regression to solve classification problems.

Linear regression: \[f_{w, b}(x)=\sum_i w_i x_i+b\] Logistic regression: \[f_{w, b}(x)=\sigma\left(\sum_i w_i x_i+b\right)\]

Logistic regression can be understood as linear regression followed by a sigmoid, which turns the linear output into a value between 0 and 1 for classification. Logistic regression is essentially a linear model: apart from the sigmoid mapping, every other step is identical to linear regression. In other words, logistic regression rests on linear regression as its foundation, but it can easily handle 0/1 classification problems.
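
As a minimal sketch of the relationship above (plain NumPy, with a made-up toy weight vector and inputs rather than anything from the post), the only difference between the two models is the sigmoid applied to the linear score:

import numpy as np

def linear_regression(X, w, b):
    # f(x) = sum_i w_i * x_i + b
    return X @ w + b

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def logistic_regression(X, w, b):
    # Same linear score, squashed to (0, 1) by the sigmoid
    return sigmoid(X @ w + b)

X = np.array([[1.0, 2.0], [3.0, 4.0]])
w = np.array([0.5, -0.25])
b = 0.1
print(linear_regression(X, w, b))    # real-valued predictions
print(logistic_regression(X, w, b))  # probabilities for class 1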

Why deep learning models prefer small parameter values (roughly within 0-1)

Smaller parameters correspond to a simpler model. The more complex a model is, the harder it tries to fit every sample, including outliers, which produces large fluctuations within small input intervals; those fluctuations show up as large derivatives, and only large parameters can produce large derivatives. Therefore, the smaller the parameters, the simpler the model.

Achieving parameter sparsity

Sparse parameters implement a form of feature selection. In general, most features contribute little to the model; although such useless features can reduce the error on the training set, they introduce interference on test samples. Introducing sparsity allows the weights of those useless features to be driven to zero (the L1-penalty sketch below illustrates this).
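
A small illustration of how an L1-style penalty drives useless weights to exactly zero; the soft-thresholding step and the threshold value here are assumed for illustration, not taken from the post:

import numpy as np

def l1_proximal_step(w, lam):
    # Soft-thresholding, the update used for L1-regularized problems:
    # weights whose magnitude is below lam are set exactly to zero.
    return np.sign(w) * np.maximum(np.abs(w) - lam, 0.0)

w = np.array([0.80, -0.02, 0.01, -0.55, 0.03])
print(l1_proximal_step(w, lam=0.05))
# -> [ 0.75 -0.    0.   -0.5   0.  ]  (small, uninformative weights become exactly zero)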

How batch size interacts with the learning rate
  • With a large batch size, the learning rate can also be set larger; a large batch size means fewer updates per epoch, so more epochs are needed for the loss to converge.
  • With a small batch size, the learning rate should be smaller, otherwise NaNs appear (the gradients explode); a small batch size means more updates, so fewer epochs are needed for the loss to converge, but training is slower.

Why does a large learning rate with a small batch size produce NaNs? A high learning rate directly scales up each update, so the steps become large. An overly large learning rate prevents the optimizer from descending smoothly to the minimum; one misstep jumps out of the controllable region, and the loss then grows by orders of magnitude.

Difference between the optimizer and the loss function:

  1. The optimizer specifies which parameters get updated and how (learning rate, momentum, SGD, etc.), plus settings such as weight decay.
  2. The loss function computes the loss; put differently, the loss function is what backpropagation differentiates (see the training-step sketch below).
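
A minimal PyTorch training step showing this division of labour; the model, data, and hyperparameters are placeholders chosen only for illustration:

import torch
import torch.nn as nn

model = nn.Linear(10, 1)                          # toy model
criterion = nn.MSELoss()                          # loss function: measures the error
optimizer = torch.optim.SGD(model.parameters(),   # optimizer: owns the parameters to update
                            lr=0.01, momentum=0.9, weight_decay=1e-4)

x, y = torch.randn(4, 10), torch.randn(4, 1)      # placeholder batch
optimizer.zero_grad()
loss = criterion(model(x), y)   # the loss function computes the loss
loss.backward()                 # backpropagation differentiates the loss
optimizer.step()                # the optimizer applies the update rule to the parameters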

Design idea behind residual structures: residual networks are, at the level of network architecture, another way to tackle vanishing/exploding gradients; they address the degradation problem that appears when stacking very deep networks. A residual block has two key design elements: the shortcut connection and the identity mapping. The shortcut makes the residual formulation possible, while the identity mapping lets the network go deeper; the identity path mainly involves two things, the skip connection and the activation function. (A minimal residual-block sketch follows.)
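
A minimal sketch of a residual block with an identity shortcut; the fully connected layers and sizes are assumptions for illustration, and it omits the projection shortcut a real ResNet uses when shapes change:

import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.fc1 = nn.Linear(dim, dim)
        self.fc2 = nn.Linear(dim, dim)
        self.act = nn.ReLU()

    def forward(self, x):
        identity = x                     # identity mapping (shortcut path)
        out = self.act(self.fc1(x))
        out = self.fc2(out)
        return self.act(out + identity)  # residual: output = F(x) + x

block = ResidualBlock(16)
print(block(torch.randn(2, 16)).shape)   # torch.Size([2, 16])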

Differences between Adam and SGD

The drawback of SGD is that its update direction depends entirely on the gradient computed from the current batch, which makes it very unstable.

Adam's main advantages are:

  • It takes the gradient information of previous steps into account, which reduces the noise in gradient updates.
  • In addition, after bias correction, the effective step size at every iteration stays within a known range, which keeps the parameters evolving smoothly.

But Adam has problems of its own: it may overfit to features that appear early in training, and features that appear later struggle to correct that early fit. Neither optimizer seems able to reliably avoid local optima.

How softmax avoids exponential overflow

When computing softmax, exponential overflow is a common problem: if the input values are very large, the exponentials may overflow. Two common remedies:

  1. Numerical-stability trick: subtract a constant from the inputs so that they become relatively small, which shrinks the exponentials. In practice, find the maximum of the input vector and subtract it from every element:

    \[\operatorname{softmax}(x_i)=\frac{e^{x_i-\max(x)}}{\sum_j e^{x_j-\max(x)}}\]

    This keeps the values in a stable range and prevents the exponentials from overflowing.

  2. Using softmax's invariance: dividing the numerator and denominator by the same constant does not change the result, so subtracting the maximum of the input vector from every element before computing softmax leaves the output unchanged.

Both approaches avoid exponential overflow and keep softmax numerically stable; in practice these tricks are routinely applied to preserve model stability and numerical precision (see the sketch below).
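
A minimal NumPy sketch of the max-subtraction trick; the input values are an assumed example chosen so that a naive softmax overflows:

import numpy as np

def softmax_naive(x):
    e = np.exp(x)                 # overflows for large inputs
    return e / e.sum()

def softmax_stable(x):
    e = np.exp(x - np.max(x))     # shift by the max: the largest exponent is exp(0) = 1
    return e / e.sum()

x = np.array([1000.0, 1001.0, 1002.0])
print(softmax_naive(x))   # [nan nan nan] plus an overflow warning
print(softmax_stable(x))  # [0.09003057 0.24472847 0.66524096]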

What should you check when the loss suddenly blows up during training?

    1. The learning rate is too large
    2. There is bad data in the training samples

  1. model.eval() vs. torch.no_grad()

    • model.eval(): gradients are still computed, just not used for updates; dropout layers keep every unit (keep probability 1); batch norm layers use the global running mean and variance (see the sketch after this list)
    • with torch.no_grad(): gradients are not computed at all
  2. Can Dropout and Batch Norm be used together?

    Yes, but Dropout should only be placed after Batch Norm. During training Dropout changes the variance of the input X, which distorts the running variance that Batch Norm accumulates; at test time there is no Dropout, so the input variance no longer matches the training-time statistics, and the variance Batch Norm expects at test time is biased relative to what it recorded during training.

  3. Vanishing and exploding gradients

    Causes of vanishing gradients and how to fix them:

    (1) Too many hidden layers: backpropagation applies the chain rule, and if some factors are smaller than 1, multiplying them across many layers makes the gradient vanish.

    (2) An unsuitable activation function: for example, sigmoid has a maximum gradient of 1/4, so (with weights smaller than 1) each hidden layer's gradient is below 1 and the gradient vanishes.

    Fixes: 1) ReLU activations, whose derivative is constantly 1; 2) batch norm; 3) residual structures.

    Causes of exploding gradients and how to fix them:

    (1) Too many hidden layers: if some factors are larger than 1, the product across layers grows exponentially and the gradient explodes.

    (2) Weights initialized too large: the derivatives get multiplied by the weights.

    Fixes: 1) gradient clipping; 2) L1/L2 weight regularization; 3) residual structures; 4) batch norm.

  4. Batch Normalization (Batch Norm) drawbacks: it may not work well on sequence data such as text, because sequences vary in length and the sentence lengths within a single training batch can differ widely; Batch Norm is also very sensitive to batch size, and with small batches it can perform poorly because the per-batch statistics fluctuate too much.

  5. Layer Normalization (Layer Norm) advantages: Layer Norm normalizes each sample individually, so it is insensitive to batch size, which makes it work better on sequence data; it is also more flexible when sequences have different lengths.

  6. Instance Normalization (Instance Norm) advantages: Instance Norm normalizes each feature of each sample, so it captures more fine-grained detail; it performs well on tasks such as style transfer where such detail matters. Drawbacks: it may over-emphasize detail and ignore more global information, and it is more expensive to compute than Batch Norm or Layer Norm.

  7. Group Normalization (Group Norm) advantages: Group Norm is a compromise between Batch Norm and Instance Norm that normalizes over groups of channels within each sample, so it captures some aggregate statistics as well as per-sample detail, and it is insensitive to batch size. Drawbacks: its performance depends on the group size, which has to be tuned experimentally, and it also costs more to compute than Batch Norm or Layer Norm.
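
A small sketch of the model.eval() / torch.no_grad() distinction and of the gradient clipping mentioned above; the model and tensors are placeholders for illustration:

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Dropout(p=0.5), nn.BatchNorm1d(8))
x = torch.randn(4, 8)

model.eval()                 # dropout disabled, batch norm uses running stats,
out = model(x)               # ...but autograd still tracks the operations
print(out.requires_grad)     # True

with torch.no_grad():        # autograd disabled entirely: no graph, no gradients
    out = model(x)
print(out.requires_grad)     # False

# Gradient clipping (one fix for exploding gradients): cap the global gradient norm
model.train()
loss = model(x).sum()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)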

Implementing self-attention and multi-head attention in PyTorch

Self-attention

from math import sqrt
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, dim_in, dim_k, dim_v):
        super(SelfAttention, self).__init__()
        self.dim_in = dim_in
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.linear_q = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_k = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_v = nn.Linear(dim_in, dim_v, bias=False)
        self._norm_fact = 1 / sqrt(dim_k)

    def forward(self, x):
        # x: (batch, n, dim_in)
        batch, n, dim_in = x.shape
        assert dim_in == self.dim_in

        q = self.linear_q(x)  # batch, n, dim_k
        k = self.linear_k(x)
        v = self.linear_v(x)

        # Scaled dot-product attention scores: batch, n, n
        dist = torch.bmm(q, k.transpose(1, 2)) * self._norm_fact
        dist = torch.softmax(dist, dim=-1)

        att = torch.bmm(dist, v)
        return att

Multi-head attention

from math import sqrt
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    # dim_in: input dimension
    # dim_k: query/key dimension
    # dim_v: value dimension
    # num_heads: number of heads

    def __init__(self, dim_in, dim_k, dim_v, num_heads=8):
        super(MultiHeadAttention, self).__init__()
        assert dim_k % num_heads == 0 and dim_v % num_heads == 0

        self.dim_in = dim_in
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.num_heads = num_heads
        self.linear_q = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_k = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_v = nn.Linear(dim_in, dim_v, bias=False)
        self._norm_fact = 1 / sqrt(dim_k // num_heads)

    def forward(self, x):
        # x: tensor of shape (batch, n, dim_in)
        batch, n, dim_in = x.shape
        assert dim_in == self.dim_in

        nh = self.num_heads
        dk = self.dim_k // nh
        dv = self.dim_v // nh

        # Project and split into heads: (batch, nh, n, dk) and (batch, nh, n, dv)
        q = self.linear_q(x).reshape(batch, n, nh, dk).transpose(1, 2)
        k = self.linear_k(x).reshape(batch, n, nh, dk).transpose(1, 2)
        v = self.linear_v(x).reshape(batch, n, nh, dv).transpose(1, 2)

        dist = torch.matmul(q, k.transpose(2, 3)) * self._norm_fact
        dist = torch.softmax(dist, dim=-1)

        att = torch.matmul(dist, v)
        att = att.transpose(1, 2).reshape(batch, n, self.dim_v)
        return att

Batch Normalization

import numpy as np

class MyBN:
    def __init__(self, momentum=0.01, eps=1e-5, feat_dim=2):
        # Running statistics used at inference time
        self._running_mean = np.zeros(shape=(feat_dim,))
        self._running_var = np.ones(shape=(feat_dim,))
        self._momentum = momentum
        # Prevents the denominator from becoming zero
        self._eps = eps

        # beta and gamma are the learnable BN parameters; initialized as in the PyTorch docs
        self._beta = np.zeros(shape=(feat_dim,))
        self._gamma = np.ones(shape=(feat_dim,))

        # Training/inference switch (set to False for inference)
        self.training = True

    def batch_norm(self, x):
        if self.training:
            x_mean = x.mean(axis=0)
            x_var = x.var(axis=0)
            # Update rule for the running mean and variance
            self._running_mean = (1 - self._momentum) * x_mean + self._momentum * self._running_mean
            self._running_var = (1 - self._momentum) * x_var + self._momentum * self._running_var
            # BN formula from the paper
            x_hat = (x - x_mean) / np.sqrt(x_var + self._eps)
        else:
            x_hat = (x - self._running_mean) / np.sqrt(self._running_var + self._eps)
        return self._gamma * x_hat + self._beta

Transformer structure

import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.dense = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # Scaled dot-product attention
        scaled_attention_logits = torch.matmul(q, k.transpose(-1, -2)) / torch.sqrt(torch.tensor(self.depth, dtype=torch.float32))
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        attention_weights = torch.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)

        # Merge heads back to (batch, seq_len, d_model)
        output = output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        output = self.dense(output)

        return output, attention_weights

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, dff):
        super(PositionwiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, dff)
        self.fc2 = nn.Linear(dff, d_model)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x

class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedForward(d_model, dff)

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

        self.dropout_rate = dropout_rate

    def forward(self, x, mask=None):
        # Self-attention sub-layer: dropout, add & norm
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)

        # Feed-forward sub-layer: dropout, add & norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

]]>
+ Deep Learning Knowledge Summary

Deep learning interview notes: this post collects basic concepts and common questions in deep learning.

https://blog.csdn.net/weixin_42693876/article/details/120345924

L2 norm https://blog.csdn.net/u010725283/article/details/79212762

L1/L2 norms https://blog.csdn.net/weixin_35849560/article/details/113395018

Why Transformer uses multiple heads https://www.zhihu.com/question/341222779

LayerNorm in Transformer https://blog.csdn.net/weixin_45069761/article/details/107834049

https://zhuanlan.zhihu.com/p/560482252

Batch vs. mini-batch https://link.zhihu.com/?target=https%3A//blog.csdn.net/xys430381_1/article/details/80680167

Optimizers https://zhuanlan.zhihu.com/p/78622301

BN https://zhuanlan.zhihu.com/p/93643523

Neural network weight initialization https://blog.csdn.net/kebu12345678/article/details/103084851

https://zhuanlan.zhihu.com/p/667048896

https://zhuanlan.zhihu.com/p/643560888

BERT model details https://www.zhihu.com/question/534763354

Why BERT's three embeddings can be added together https://www.zhihu.com/question/374835153/answer/1080315948

LLaMA 2 architecture https://blog.csdn.net/sikh_0529/article/details/134375318

Rotary position embedding (RoPE) https://www.zhihu.com/tardis/zm/art/647109286?source_id=1005

QLoRA https://zhuanlan.zhihu.com/p/618894919

RLHF https://zhuanlan.zhihu.com/p/631238431

    https://zhuanlan.zhihu.com/p/599016986

Logistic regression vs. linear regression

Linear regression solves regression problems; logistic regression builds on linear regression to solve classification problems.

Linear regression: \[f_{w, b}(x)=\sum_i w_i x_i+b\] Logistic regression: \[f_{w, b}(x)=\sigma\left(\sum_i w_i x_i+b\right)\]

Logistic regression can be understood as linear regression followed by a sigmoid, which turns the linear output into a value between 0 and 1 for classification. Logistic regression is essentially a linear model: apart from the sigmoid mapping, every other step is identical to linear regression. In other words, logistic regression rests on linear regression as its foundation, but it can easily handle 0/1 classification problems.
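
As a minimal sketch of the relationship above (plain NumPy, with a made-up toy weight vector and inputs rather than anything from the post), the only difference between the two models is the sigmoid applied to the linear score:

import numpy as np

def linear_regression(X, w, b):
    # f(x) = sum_i w_i * x_i + b
    return X @ w + b

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def logistic_regression(X, w, b):
    # Same linear score, squashed to (0, 1) by the sigmoid
    return sigmoid(X @ w + b)

X = np.array([[1.0, 2.0], [3.0, 4.0]])
w = np.array([0.5, -0.25])
b = 0.1
print(linear_regression(X, w, b))    # real-valued predictions
print(logistic_regression(X, w, b))  # probabilities for class 1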

Why deep learning models prefer small parameter values (roughly within 0-1)

Smaller parameters correspond to a simpler model. The more complex a model is, the harder it tries to fit every sample, including outliers, which produces large fluctuations within small input intervals; those fluctuations show up as large derivatives, and only large parameters can produce large derivatives. Therefore, the smaller the parameters, the simpler the model.

Achieving parameter sparsity

Sparse parameters implement a form of feature selection. In general, most features contribute little to the model; although such useless features can reduce the error on the training set, they introduce interference on test samples. Introducing sparsity allows the weights of those useless features to be driven to zero (the L1-penalty sketch below illustrates this).
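
A small illustration of how an L1-style penalty drives useless weights to exactly zero; the soft-thresholding step and the threshold value here are assumed for illustration, not taken from the post:

import numpy as np

def l1_proximal_step(w, lam):
    # Soft-thresholding, the update used for L1-regularized problems:
    # weights whose magnitude is below lam are set exactly to zero.
    return np.sign(w) * np.maximum(np.abs(w) - lam, 0.0)

w = np.array([0.80, -0.02, 0.01, -0.55, 0.03])
print(l1_proximal_step(w, lam=0.05))
# -> [ 0.75 -0.    0.   -0.5   0.  ]  (small, uninformative weights become exactly zero)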

How batch size interacts with the learning rate
  • With a large batch size, the learning rate can also be set larger; a large batch size means fewer updates per epoch, so more epochs are needed for the loss to converge.
  • With a small batch size, the learning rate should be smaller, otherwise NaNs appear (the gradients explode); a small batch size means more updates, so fewer epochs are needed for the loss to converge, but training is slower.

Why does a large learning rate with a small batch size produce NaNs? A high learning rate directly scales up each update, so the steps become large. An overly large learning rate prevents the optimizer from descending smoothly to the minimum; one misstep jumps out of the controllable region, and the loss then grows by orders of magnitude.

Difference between the optimizer and the loss function:

  1. The optimizer specifies which parameters get updated and how (learning rate, momentum, SGD, etc.), plus settings such as weight decay.
  2. The loss function computes the loss; put differently, the loss function is what backpropagation differentiates (see the training-step sketch below).
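
A minimal PyTorch training step showing this division of labour; the model, data, and hyperparameters are placeholders chosen only for illustration:

import torch
import torch.nn as nn

model = nn.Linear(10, 1)                          # toy model
criterion = nn.MSELoss()                          # loss function: measures the error
optimizer = torch.optim.SGD(model.parameters(),   # optimizer: owns the parameters to update
                            lr=0.01, momentum=0.9, weight_decay=1e-4)

x, y = torch.randn(4, 10), torch.randn(4, 1)      # placeholder batch
optimizer.zero_grad()
loss = criterion(model(x), y)   # the loss function computes the loss
loss.backward()                 # backpropagation differentiates the loss
optimizer.step()                # the optimizer applies the update rule to the parameters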

Design idea behind residual structures: residual networks are, at the level of network architecture, another way to tackle vanishing/exploding gradients; they address the degradation problem that appears when stacking very deep networks. A residual block has two key design elements: the shortcut connection and the identity mapping. The shortcut makes the residual formulation possible, while the identity mapping lets the network go deeper; the identity path mainly involves two things, the skip connection and the activation function. (A minimal residual-block sketch follows.)
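
A minimal sketch of a residual block with an identity shortcut; the fully connected layers and sizes are assumptions for illustration, and it omits the projection shortcut a real ResNet uses when shapes change:

import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.fc1 = nn.Linear(dim, dim)
        self.fc2 = nn.Linear(dim, dim)
        self.act = nn.ReLU()

    def forward(self, x):
        identity = x                     # identity mapping (shortcut path)
        out = self.act(self.fc1(x))
        out = self.fc2(out)
        return self.act(out + identity)  # residual: output = F(x) + x

block = ResidualBlock(16)
print(block(torch.randn(2, 16)).shape)   # torch.Size([2, 16])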

Differences between Adam and SGD

The drawback of SGD is that its update direction depends entirely on the gradient computed from the current batch, which makes it very unstable.

Adam's main advantages are:

  • It takes the gradient information of previous steps into account, which reduces the noise in gradient updates.
  • In addition, after bias correction, the effective step size at every iteration stays within a known range, which keeps the parameters evolving smoothly.

But Adam has problems of its own: it may overfit to features that appear early in training, and features that appear later struggle to correct that early fit. Neither optimizer seems able to reliably avoid local optima.

How softmax avoids exponential overflow

When computing softmax, exponential overflow is a common problem: if the input values are very large, the exponentials may overflow. Two common remedies:

  1. Numerical-stability trick: subtract a constant from the inputs so that they become relatively small, which shrinks the exponentials. In practice, find the maximum of the input vector and subtract it from every element:

    \[\operatorname{softmax}(x_i)=\frac{e^{x_i-\max(x)}}{\sum_j e^{x_j-\max(x)}}\]

    This keeps the values in a stable range and prevents the exponentials from overflowing.

  2. Using softmax's invariance: dividing the numerator and denominator by the same constant does not change the result, so subtracting the maximum of the input vector from every element before computing softmax leaves the output unchanged.

Both approaches avoid exponential overflow and keep softmax numerically stable; in practice these tricks are routinely applied to preserve model stability and numerical precision (see the sketch below).
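
A minimal NumPy sketch of the max-subtraction trick; the input values are an assumed example chosen so that a naive softmax overflows:

import numpy as np

def softmax_naive(x):
    e = np.exp(x)                 # overflows for large inputs
    return e / e.sum()

def softmax_stable(x):
    e = np.exp(x - np.max(x))     # shift by the max: the largest exponent is exp(0) = 1
    return e / e.sum()

x = np.array([1000.0, 1001.0, 1002.0])
print(softmax_naive(x))   # [nan nan nan] plus an overflow warning
print(softmax_stable(x))  # [0.09003057 0.24472847 0.66524096]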

What should you check when the loss suddenly blows up during training?

    1. The learning rate is too large
    2. There is bad data in the training samples

  1. model.eval() vs. torch.no_grad()

    • model.eval(): gradients are still computed, just not used for updates; dropout layers keep every unit (keep probability 1); batch norm layers use the global running mean and variance (see the sketch after this list)
    • with torch.no_grad(): gradients are not computed at all
  2. Can Dropout and Batch Norm be used together?

    Yes, but Dropout should only be placed after Batch Norm. During training Dropout changes the variance of the input X, which distorts the running variance that Batch Norm accumulates; at test time there is no Dropout, so the input variance no longer matches the training-time statistics, and the variance Batch Norm expects at test time is biased relative to what it recorded during training.

  3. Vanishing and exploding gradients

    Causes of vanishing gradients and how to fix them:

    (1) Too many hidden layers: backpropagation applies the chain rule, and if some factors are smaller than 1, multiplying them across many layers makes the gradient vanish.

    (2) An unsuitable activation function: for example, sigmoid has a maximum gradient of 1/4, so (with weights smaller than 1) each hidden layer's gradient is below 1 and the gradient vanishes.

    Fixes: 1) ReLU activations, whose derivative is constantly 1; 2) batch norm; 3) residual structures.

    Causes of exploding gradients and how to fix them:

    (1) Too many hidden layers: if some factors are larger than 1, the product across layers grows exponentially and the gradient explodes.

    (2) Weights initialized too large: the derivatives get multiplied by the weights.

    Fixes: 1) gradient clipping; 2) L1/L2 weight regularization; 3) residual structures; 4) batch norm.

  4. Batch Normalization (Batch Norm) drawbacks: it may not work well on sequence data such as text, because sequences vary in length and the sentence lengths within a single training batch can differ widely; Batch Norm is also very sensitive to batch size, and with small batches it can perform poorly because the per-batch statistics fluctuate too much.

  5. Layer Normalization (Layer Norm) advantages: Layer Norm normalizes each sample individually, so it is insensitive to batch size, which makes it work better on sequence data; it is also more flexible when sequences have different lengths.

  6. Instance Normalization (Instance Norm) advantages: Instance Norm normalizes each feature of each sample, so it captures more fine-grained detail; it performs well on tasks such as style transfer where such detail matters. Drawbacks: it may over-emphasize detail and ignore more global information, and it is more expensive to compute than Batch Norm or Layer Norm.

  7. Group Normalization (Group Norm) advantages: Group Norm is a compromise between Batch Norm and Instance Norm that normalizes over groups of channels within each sample, so it captures some aggregate statistics as well as per-sample detail, and it is insensitive to batch size. Drawbacks: its performance depends on the group size, which has to be tuned experimentally, and it also costs more to compute than Batch Norm or Layer Norm.
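
A small sketch of the model.eval() / torch.no_grad() distinction and of the gradient clipping mentioned above; the model and tensors are placeholders for illustration:

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Dropout(p=0.5), nn.BatchNorm1d(8))
x = torch.randn(4, 8)

model.eval()                 # dropout disabled, batch norm uses running stats,
out = model(x)               # ...but autograd still tracks the operations
print(out.requires_grad)     # True

with torch.no_grad():        # autograd disabled entirely: no graph, no gradients
    out = model(x)
print(out.requires_grad)     # False

# Gradient clipping (one fix for exploding gradients): cap the global gradient norm
model.train()
loss = model(x).sum()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)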

Implementing self-attention and multi-head attention in PyTorch

Self-attention

from math import sqrt
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, dim_in, dim_k, dim_v):
        super(SelfAttention, self).__init__()
        self.dim_in = dim_in
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.linear_q = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_k = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_v = nn.Linear(dim_in, dim_v, bias=False)
        self._norm_fact = 1 / sqrt(dim_k)

    def forward(self, x):
        # x: (batch, n, dim_in)
        batch, n, dim_in = x.shape
        assert dim_in == self.dim_in

        q = self.linear_q(x)  # batch, n, dim_k
        k = self.linear_k(x)
        v = self.linear_v(x)

        # Scaled dot-product attention scores: batch, n, n
        dist = torch.bmm(q, k.transpose(1, 2)) * self._norm_fact
        dist = torch.softmax(dist, dim=-1)

        att = torch.bmm(dist, v)
        return att

Multi-head attention

from math import sqrt
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    # dim_in: input dimension
    # dim_k: query/key dimension
    # dim_v: value dimension
    # num_heads: number of heads

    def __init__(self, dim_in, dim_k, dim_v, num_heads=8):
        super(MultiHeadAttention, self).__init__()
        assert dim_k % num_heads == 0 and dim_v % num_heads == 0

        self.dim_in = dim_in
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.num_heads = num_heads
        self.linear_q = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_k = nn.Linear(dim_in, dim_k, bias=False)
        self.linear_v = nn.Linear(dim_in, dim_v, bias=False)
        self._norm_fact = 1 / sqrt(dim_k // num_heads)

    def forward(self, x):
        # x: tensor of shape (batch, n, dim_in)
        batch, n, dim_in = x.shape
        assert dim_in == self.dim_in

        nh = self.num_heads
        dk = self.dim_k // nh
        dv = self.dim_v // nh

        # Project and split into heads: (batch, nh, n, dk) and (batch, nh, n, dv)
        q = self.linear_q(x).reshape(batch, n, nh, dk).transpose(1, 2)
        k = self.linear_k(x).reshape(batch, n, nh, dk).transpose(1, 2)
        v = self.linear_v(x).reshape(batch, n, nh, dv).transpose(1, 2)

        dist = torch.matmul(q, k.transpose(2, 3)) * self._norm_fact
        dist = torch.softmax(dist, dim=-1)

        att = torch.matmul(dist, v)
        att = att.transpose(1, 2).reshape(batch, n, self.dim_v)
        return att

Batch Normalization

import numpy as np

class MyBN:
    def __init__(self, momentum=0.01, eps=1e-5, feat_dim=2):
        # Running statistics used at inference time
        self._running_mean = np.zeros(shape=(feat_dim,))
        self._running_var = np.ones(shape=(feat_dim,))
        self._momentum = momentum
        # Prevents the denominator from becoming zero
        self._eps = eps

        # beta and gamma are the learnable BN parameters; initialized as in the PyTorch docs
        self._beta = np.zeros(shape=(feat_dim,))
        self._gamma = np.ones(shape=(feat_dim,))

        # Training/inference switch (set to False for inference)
        self.training = True

    def batch_norm(self, x):
        if self.training:
            x_mean = x.mean(axis=0)
            x_var = x.var(axis=0)
            # Update rule for the running mean and variance
            self._running_mean = (1 - self._momentum) * x_mean + self._momentum * self._running_mean
            self._running_var = (1 - self._momentum) * x_var + self._momentum * self._running_var
            # BN formula from the paper
            x_hat = (x - x_mean) / np.sqrt(x_var + self._eps)
        else:
            x_hat = (x - self._running_mean) / np.sqrt(self._running_var + self._eps)
        return self._gamma * x_hat + self._beta

Transformer structure

import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim  # stored so forward() can reassemble the heads
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.query_fc = nn.Linear(embed_dim, embed_dim)
        self.key_fc = nn.Linear(embed_dim, embed_dim)
        self.value_fc = nn.Linear(embed_dim, embed_dim)
        self.fc_out = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        # Linearly project queries, keys, and values
        Q = self.query_fc(query)
        K = self.key_fc(key)
        V = self.value_fc(value)

        # Split the embedding into num_heads
        Q = Q.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

        # Calculate the attention scores
        scores = torch.matmul(Q, K.permute(0, 1, 3, 2)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-1e20"))

        # Apply softmax to get attention probabilities
        attention_weights = F.softmax(scores, dim=-1)

        # Apply dropout
        attention_weights = F.dropout(attention_weights, p=0.1, training=self.training)

        # Multiply the attention weights with the values
        output = torch.matmul(attention_weights, V)

        # Concatenate multi-heads and project
        output = output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.embed_dim)
        output = self.fc_out(output)

        return output, attention_weights

class PositionwiseFeedforward(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super(PositionwiseFeedforward, self).__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class EncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, hidden_dim):
        super(EncoderLayer, self).__init__()
        self.multihead_attention = MultiHeadAttention(embed_dim, num_heads)
        self.feed_forward = PositionwiseFeedforward(embed_dim, hidden_dim)
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.layer_norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, mask=None):
        # Multi-Head Attention sub-layer with residual connection
        residual = x
        x, _ = self.multihead_attention(x, x, x, mask)
        x = self.layer_norm1(x + residual)

        # Feed Forward sub-layer with residual connection
        residual = x
        x = self.feed_forward(x)
        x = self.layer_norm2(x + residual)

        return x

class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, hidden_dim):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.layers = nn.ModuleList([EncoderLayer(embed_dim, num_heads, hidden_dim) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x

]]>
diff --git a/search.xml b/search.xml index f54ed9d..df9e574 100644 --- a/search.xml +++ b/search.xml @@ -4262,7 +4262,7 @@ id="pytorch实现自注意力和多头注意力">pytorch实现自注意力和多

Batch Normalization

import numpy as np

class MyBN:
    def __init__(self, momentum=0.01, eps=1e-5, feat_dim=2):
        # Running statistics used at inference time
        self._running_mean = np.zeros(shape=(feat_dim,))
        self._running_var = np.ones(shape=(feat_dim,))
        self._momentum = momentum
        # Prevents the denominator from becoming zero
        self._eps = eps

        # beta and gamma are the learnable BN parameters; initialized as in the PyTorch docs
        self._beta = np.zeros(shape=(feat_dim,))
        self._gamma = np.ones(shape=(feat_dim,))

        # Training/inference switch (set to False for inference)
        self.training = True

    def batch_norm(self, x):
        if self.training:
            x_mean = x.mean(axis=0)
            x_var = x.var(axis=0)
            # Update rule for the running mean and variance
            self._running_mean = (1 - self._momentum) * x_mean + self._momentum * self._running_mean
            self._running_var = (1 - self._momentum) * x_var + self._momentum * self._running_var
            # BN formula from the paper
            x_hat = (x - x_mean) / np.sqrt(x_var + self._eps)
        else:
            x_hat = (x - self._running_mean) / np.sqrt(self._running_var + self._eps)
        return self._gamma * x_hat + self._beta

Transformer structure

-
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.dense = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # Scaled dot-product attention
        scaled_attention_logits = torch.matmul(q, k.transpose(-1, -2)) / torch.sqrt(torch.tensor(self.depth, dtype=torch.float32))
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        attention_weights = torch.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)

        # Merge heads back to (batch, seq_len, d_model)
        output = output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        output = self.dense(output)

        return output, attention_weights

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, dff):
        super(PositionwiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, dff)
        self.fc2 = nn.Linear(dff, d_model)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x

class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedForward(d_model, dff)

        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

        self.dropout_rate = dropout_rate

    def forward(self, x, mask=None):
        # Self-attention sub-layer: dropout, add & norm
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)

        # Feed-forward sub-layer: dropout, add & norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

+
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim  # stored so forward() can reassemble the heads
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.query_fc = nn.Linear(embed_dim, embed_dim)
        self.key_fc = nn.Linear(embed_dim, embed_dim)
        self.value_fc = nn.Linear(embed_dim, embed_dim)
        self.fc_out = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[0]

        # Linearly project queries, keys, and values
        Q = self.query_fc(query)
        K = self.key_fc(key)
        V = self.value_fc(value)

        # Split the embedding into num_heads
        Q = Q.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

        # Calculate the attention scores
        scores = torch.matmul(Q, K.permute(0, 1, 3, 2)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-1e20"))

        # Apply softmax to get attention probabilities
        attention_weights = F.softmax(scores, dim=-1)

        # Apply dropout
        attention_weights = F.dropout(attention_weights, p=0.1, training=self.training)

        # Multiply the attention weights with the values
        output = torch.matmul(attention_weights, V)

        # Concatenate multi-heads and project
        output = output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.embed_dim)
        output = self.fc_out(output)

        return output, attention_weights

class PositionwiseFeedforward(nn.Module):
    def __init__(self, embed_dim, hidden_dim):
        super(PositionwiseFeedforward, self).__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class EncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, hidden_dim):
        super(EncoderLayer, self).__init__()
        self.multihead_attention = MultiHeadAttention(embed_dim, num_heads)
        self.feed_forward = PositionwiseFeedforward(embed_dim, hidden_dim)
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.layer_norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, mask=None):
        # Multi-Head Attention sub-layer with residual connection
        residual = x
        x, _ = self.multihead_attention(x, x, x, mask)
        x = self.layer_norm1(x + residual)

        # Feed Forward sub-layer with residual connection
        residual = x
        x = self.feed_forward(x)
        x = self.layer_norm2(x + residual)

        return x

class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_layers, num_heads, hidden_dim):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.layers = nn.ModuleList([EncoderLayer(embed_dim, num_heads, hidden_dim) for _ in range(num_layers)])

    def forward(self, x, mask=None):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x

]]> Deep Learning