xlmroberta speedup
黄宇扬 committed Sep 24, 2024
1 parent 02de743 commit e883fc5
Showing 3 changed files with 27 additions and 32 deletions.
2 changes: 0 additions & 2 deletions include/models/xlmroberta.h
@@ -28,8 +28,6 @@ namespace fastllm {
const Data &positionIds,
bool normalize);

- void WarmUp(); // warm up
-
std::string model_type;

float layer_norm_eps = 1e-12;
7 changes: 5 additions & 2 deletions src/models/bert.cpp
@@ -95,7 +95,10 @@ namespace fastllm {
PermuteSelf(k, {0, 2, 1, 3});
PermuteSelf(v, {0, 2, 1, 3});
MatMulTransB(q, k, qk, 1.0 / sqrt(this->head_dim), 1);
- AttentionExtendedMask(qk, attentionMask);
+ std::vector <int> dims = qk.dims;
+ qk.Resize({dims[0], -1, dims[3]});
+ AttentionMask(qk, attentionMask, -1e9);
+ qk.Resize(dims);

Softmax(qk, qk, -1);
MatMul(qk, v, qkv, 1.0, 1);
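Note on this hunk: the old AttentionExtendedMask call appears to have applied a precomputed additive bias (the -1e10f buffer in the next hunk), while the new code views the contiguous {batch, heads, q_len, k_len} score tensor as {batch, heads*q_len, k_len} so that a plain {batch, k_len} padding mask can be applied in one pass, assuming AttentionMask writes the given value (-1e9) wherever the mask marks a padded key. A minimal standalone sketch of that index arithmetic, independent of fastllm's Data class and kernels (MaskScores is an illustrative name, not a fastllm API):

    #include <cstdio>
    #include <vector>

    // Masked fill on a score buffer viewed as {batch, heads*q_len, k_len}:
    // wherever mask[b * kLen + k] == 1 (padding), write maskValue into the score.
    // This reproduces what an additive -1e10 bias on padded columns achieved before.
    void MaskScores(std::vector<float> &scores, const std::vector<float> &mask,
                    int batch, int heads, int qLen, int kLen, float maskValue) {
        int rowsPerBatch = heads * qLen;              // flattened middle dimension
        for (int b = 0; b < batch; b++) {
            for (int r = 0; r < rowsPerBatch; r++) {
                float *row = scores.data() + ((long long)b * rowsPerBatch + r) * kLen;
                for (int k = 0; k < kLen; k++) {
                    if (mask[b * kLen + k] > 0.5f) {  // 1 = padded position
                        row[k] = maskValue;           // e.g. -1e9, pushed to ~0 by softmax
                    }
                }
            }
        }
    }

    int main() {
        int batch = 2, heads = 2, qLen = 3, kLen = 3;
        std::vector<float> scores(batch * heads * qLen * kLen, 1.0f);
        std::vector<float> mask = {0, 0, 1,   // batch 0: last key is padding
                                   0, 1, 1};  // batch 1: last two keys are padding
        MaskScores(scores, mask, batch, heads, qLen, kLen, -1e9f);
        printf("scores[0..2] = %g %g %g\n", scores[0], scores[1], scores[2]);
        return 0;
    }

Because every head and every query row of a given batch element shares the same padded key columns, flattening the middle dimensions loses nothing.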
@@ -150,7 +153,7 @@ namespace fastllm {
std::vector <float> ids = std::vector <float> (batch * len, 0.0f);
std::vector <float> seqLens = std::vector <float> (batch, 0.0f);
std::vector <float> token_type_ids = std::vector <float> (batch * len, 0.0f);
- std::vector <float> attention_mask = std::vector <float> (batch * len, -1e10f);
+ std::vector <float> attention_mask = std::vector <float> (batch * len, 1);
std::vector <float> position_ids = std::vector <float> (batch * len, 0.0f);
for (int i = 0; i < batch; i++) {
seqLens[i] = tokens[i].size();
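Note on this hunk: the host-side buffer now encodes validity (1 = padded by default) instead of an additive bias (-1e10f), since masking is done later by AttentionMask(qk, attentionMask, -1e9). The loop that fills the buffer is truncated above, so the exact fill logic is not visible here; the sketch below shows what a 1 = padding / 0 = token mask built from the sequence lengths could look like under that assumption (BuildPaddingMask is illustrative, not a fastllm function):

    #include <cstdio>
    #include <vector>

    // Hypothetical helper: build the new-style padding mask, where 1.0f marks a
    // padded slot and 0.0f a real token. The zeroing of valid positions is an
    // assumption inferred from the changed initializer (1 instead of -1e10f).
    std::vector<float> BuildPaddingMask(const std::vector<int> &seqLens, int maxLen) {
        int batch = (int)seqLens.size();
        std::vector<float> mask(batch * maxLen, 1.0f);    // start fully masked
        for (int b = 0; b < batch; b++)
            for (int t = 0; t < seqLens[b] && t < maxLen; t++)
                mask[b * maxLen + t] = 0.0f;              // uncover real tokens
        return mask;
    }

    int main() {
        std::vector<float> mask = BuildPaddingMask({2, 4}, 4);
        for (int b = 0; b < 2; b++) {
            for (int t = 0; t < 4; t++) printf("%g ", mask[b * 4 + t]);
            printf("\n");
        }
        return 0;
    }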
50 changes: 22 additions & 28 deletions src/models/xlmroberta.cpp
@@ -50,7 +50,7 @@ namespace fastllm {
std::vector <float> ids = std::vector <float> (batch * len, 0.0f);
std::vector <float> seqLens = std::vector <float> (batch, 0.0f);
std::vector <float> token_type_ids = std::vector <float> (batch * len, 0.0f);
- std::vector <float> attention_mask = std::vector <float> (batch * len, -1e10f);
+ std::vector <float> attention_mask = std::vector <float> (batch * len, 1);
std::vector <float> position_ids = std::vector <float> (batch * len, 0.0f);
for (int i = 0; i < batch; i++) {
seqLens[i] = tokens[i].size();
@@ -66,19 +66,6 @@ namespace fastllm {
positionIds.CopyFrom(fastllm::Data(fastllm::DataType::FLOAT32, {batch, len}, position_ids));
}

- void Normalize__(float *data, int dataLen)
- {
-     float sum = 0.0;
-     for(int i = 0; i < dataLen; i++)
-         sum += data[i] * data[i];
-
-     if (sum < 1e-6) sum = 1e-6;
-     else sum = sqrt(sum);
-
-     for(int i = 0; i < dataLen; i++)
-         data[i] = data[i] / sum;
- }

std::vector <std::vector <float> > XlmRobertaModel::ForwardAll(
const Data &inputIds,
const Data &attentionMask,
@@ -93,6 +80,7 @@ namespace fastllm {
AddTo(inputEmbeddings, positionIdEmbeddings);
Data hiddenStates, firstStates;
LayerNorm(inputEmbeddings, this->weight["roberta.embeddings.LayerNorm.weight"], this->weight["roberta.embeddings.LayerNorm.bias"], -1, hiddenStates);
+ int bsz = hiddenStates.dims[0], seqlen = hiddenStates.dims[1];
Data q, k, v, qk, qkv, attnOutput, inter, pooler, logits;
for (int i = 0; i < this->block_cnt; i++) {
std::string queryWeightName = "roberta.encoder.layer." + std::to_string(i) + ".attention.self.query.weight";
@@ -123,15 +111,27 @@ namespace fastllm {
PermuteSelf(q, {0, 2, 1, 3});
PermuteSelf(k, {0, 2, 1, 3});
PermuteSelf(v, {0, 2, 1, 3});
- MatMulTransB(q, k, qk, 1.0 / sqrt(this->head_dim), 1);
- AttentionExtendedMask(qk, attentionMask);
-
- Softmax(qk, qk, -1);
- MatMul(qk, v, qkv, 1.0, 1);
-
- PermuteSelf(qkv, {0, 2, 1, 3});
- qkv.Reshape({qkv.dims[0], qkv.dims[1], -1});

+ if (bsz == 1) {
+     q.Reshape({-1, q.dims[2], q.dims[3]});
+     k.Reshape({-1, k.dims[2], k.dims[3]});
+     v.Reshape({-1, v.dims[2], v.dims[3]});
+     Attention(q, k, v, Data(), qkv, q.dims[0] / k.dims[0], 1.0 / sqrt(this->head_dim), 1);
+     PermuteSelf(qkv, {1, 0, 2});
+     qkv.Reshape({seqlen, bsz, -1});
+     PermuteSelf(qkv, {1, 0, 2});
+ } else {
+     MatMulTransB(q, k, qk, 1.0 / sqrt(this->head_dim), 1);
+     std::vector <int> dims = qk.dims;
+     qk.Reshape({dims[0], -1, dims[3]});
+     AttentionMask(qk, attentionMask, -1e9);
+     qk.Reshape(dims);
+     Softmax(qk, qk, -1);
+     MatMul(qk, v, qkv, 1.0, 1);
+     PermuteSelf(qkv, {0, 2, 1, 3});
+     qkv.Reshape({qkv.dims[0], qkv.dims[1], -1});
+ }

Linear(qkv, this->weight[attnOutputWeightName], this->weight[attnOutputbiasName], attnOutput);
AddTo(hiddenStates, attnOutput);
LayerNorm(hiddenStates, this->weight[attnLNWeightName], this->weight[attnLNbiasName], -1, hiddenStates);
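Note on this hunk: the new bsz == 1 fast path drops the padding mask entirely (a single sequence has no padding), collapses q/k/v from {1, heads, len, head_dim} to {heads, len, head_dim}, and calls the fused Attention op; the PermuteSelf/Reshape/PermuteSelf sequence afterwards only restores the {bsz, seqlen, heads*head_dim} layout expected by the following Linear, and with bsz == 1 the last two steps effectively just relabel dimensions. Batched inputs keep the explicit MatMulTransB/Softmax path with the same reshape-based masking as in bert.cpp. A small standalone sketch of the layout restoration (plain C++, not using fastllm's Data/PermuteSelf; Permute102 is an illustrative helper):

    #include <cstdio>
    #include <vector>

    // Permute a row-major {d0, d1, d2} tensor with axis order {1, 0, 2},
    // i.e. swap the first two axes, producing a {d1, d0, d2} tensor.
    std::vector<float> Permute102(const std::vector<float> &src, int d0, int d1, int d2) {
        std::vector<float> dst(src.size());
        for (int i = 0; i < d0; i++)
            for (int j = 0; j < d1; j++)
                for (int k = 0; k < d2; k++)
                    dst[(j * d0 + i) * d2 + k] = src[(i * d1 + j) * d2 + k];
        return dst;
    }

    int main() {
        int heads = 2, len = 3, headDim = 2;
        // Fused-attention output layout: {heads, len, head_dim}; encode h*100 + t*10 + d.
        std::vector<float> qkv(heads * len * headDim);
        for (int h = 0; h < heads; h++)
            for (int t = 0; t < len; t++)
                for (int d = 0; d < headDim; d++)
                    qkv[(h * len + t) * headDim + d] = (float)(h * 100 + t * 10 + d);

        // PermuteSelf({1, 0, 2}): {heads, len, head_dim} -> {len, heads, head_dim}.
        std::vector<float> perToken = Permute102(qkv, heads, len, headDim);

        // Reshape({len, 1, heads*head_dim}) followed by PermuteSelf({1, 0, 2}) does not
        // move any data when bsz == 1, so each token's hidden vector is simply the
        // concatenation of its per-head outputs:
        for (int t = 0; t < len; t++) {
            printf("token %d:", t);
            for (int x = 0; x < heads * headDim; x++)
                printf(" %g", perToken[t * heads * headDim + x]);
            printf("\n");
        }
        return 0;
    }

The printed rows show each token's hidden vector as the concatenation of its per-head outputs, which matches what the batched path produces after PermuteSelf({0, 2, 1, 3}) and the final Reshape.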
@@ -171,10 +171,4 @@ namespace fastllm {
}
return ret;
}

- void XlmRobertaModel::WarmUp() {
-     // printf("Warmup...\n");
-     // EmbeddingSentence({"1"});
-     // printf("finish.\n");
- }
}
