Commit

#4003: added optimized bloom
arakhmati committed Dec 5, 2023
1 parent 6abdc62 commit 2ab4da8
Showing 29 changed files with 2,057 additions and 1,026 deletions.
docs/source/ttnn/operations.rst: 2 changes (1 addition, 1 deletion)
@@ -5,8 +5,8 @@ Operations
 :maxdepth: 1

 operations/add
+operations/deallocate
 operations/embedding
-operations/free
 operations/from_device
 operations/from_torch
 operations/matmul
docs/source/ttnn/operations/deallocate.rst: 4 changes (4 additions, 0 deletions)
@@ -0,0 +1,4 @@
+TTNN.DEALLOCATE
+===============
+
+.. autofunction:: ttnn.deallocate
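
For context, ttnn.deallocate (the new name for ttnn.free) explicitly releases a device tensor's buffer once its last consumer has run, so the memory can be reused by later operations. A minimal usage sketch, not part of this commit, assuming a, b, and weight are ttnn tensors already placed on a device in tile layout:

import ttnn

def add_then_project(a, b, weight):
    # a, b, weight: ttnn tensors already on the device, in TILE_LAYOUT.
    intermediate = a + b                        # allocates a new device buffer
    output = ttnn.matmul(intermediate, weight)  # last consumer of `intermediate`
    ttnn.deallocate(intermediate)               # release the buffer right away
    return output                               # `intermediate` must not be used after this
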
docs/source/ttnn/operations/free.rst: 4 changes (0 additions, 4 deletions)

This file was deleted.

@@ -36,7 +36,7 @@ def ttnn_optimized_multi_head_attention(
         memory_config=ttnn.L1_MEMORY_CONFIG,
         core_grid=(batch_size, num_cores_x),
     )
-    ttnn.free(fused_qkv_output)
+    ttnn.deallocate(fused_qkv_output)

     attention_scores = ttnn.matmul(
         query,
@@ -45,8 +45,8 @@ def ttnn_optimized_multi_head_attention(
         dtype=ttnn.bfloat16,
         core_grid=(batch_size, num_cores_x),
     )
-    ttnn.free(query)
-    ttnn.free(key)
+    ttnn.deallocate(query)
+    ttnn.deallocate(key)

     attention_probs = ttnn.nlp.attention_softmax_(attention_scores, attention_mask=attention_mask, head_size=head_size)

@@ -57,12 +57,12 @@ def ttnn_optimized_multi_head_attention(
         dtype=ttnn.bfloat8_b,
         core_grid=(batch_size, num_cores_x),
     )
-    ttnn.free(attention_probs)
+    ttnn.deallocate(attention_probs)
+    ttnn.deallocate(value)

     context_layer = ttnn.nlp.concatenate_heads(
         context_layer,
         memory_config=ttnn.L1_MEMORY_CONFIG,
         core_grid=(batch_size, num_cores_x),
     )

     self_output = ttnn.linear(
@@ -73,7 +73,7 @@ def ttnn_optimized_multi_head_attention(
         dtype=ttnn.bfloat16,
         core_grid=(batch_size, num_cores_x),
     )
-    ttnn.free(context_layer)
+    ttnn.deallocate(context_layer)

     return self_output

@@ -100,6 +100,7 @@ def ttnn_optimized_feedforward(hidden_states, ff1_weight, ff1_bias, ff2_weight,
         dtype=ttnn.bfloat16,
         core_grid=(batch_size, num_cores_x),
     )
+    ttnn.deallocate(ff1_output)

     return ff2_output

@@ -129,8 +130,8 @@ def ttnn_optimized_bert_encoder(
         bias=parameters[f"bert.encoder.layer.{encoder_index}.attention.output.LayerNorm.bias"],
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
-    ttnn.free(hidden_states)
-    ttnn.free(multi_head_attention_output)
+    ttnn.deallocate(hidden_states)
+    ttnn.deallocate(multi_head_attention_output)

     feedforward_output = ttnn_optimized_feedforward(
         multi_head_attention_add_and_layer_norm_output,
@@ -147,8 +148,8 @@ def ttnn_optimized_bert_encoder(
         bias=parameters[f"bert.encoder.layer.{encoder_index}.output.LayerNorm.bias"],
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
-    ttnn.free(multi_head_attention_add_and_layer_norm_output)
-    ttnn.free(feedforward_output)
+    ttnn.deallocate(multi_head_attention_add_and_layer_norm_output)
+    ttnn.deallocate(feedforward_output)

     return feedforward_add_and_layer_norm_output

@@ -165,40 +166,43 @@ def ttnn_optimized_bert(
     import tt_lib as ttl

     word_embeddings = ttnn.embedding(
-        input_ids, parameters["bert.embeddings.word_embeddings.weight"], layout=ttnn.TILE_LAYOUT
+        input_ids,
+        parameters["bert.embeddings.word_embeddings.weight"],
+        layout=ttnn.TILE_LAYOUT,
     )
-    ttnn.free(input_ids)
+    ttnn.deallocate(input_ids)

     token_type_embeddings = ttnn.embedding(
-        token_type_ids, parameters["bert.embeddings.token_type_embeddings.weight"], layout=ttnn.TILE_LAYOUT
+        token_type_ids,
+        parameters["bert.embeddings.token_type_embeddings.weight"],
+        layout=ttnn.TILE_LAYOUT,
     )
-    ttnn.free(token_type_ids)
+    ttnn.deallocate(token_type_ids)

     embeddings = word_embeddings + token_type_embeddings
-    ttnn.free(word_embeddings)
-    ttnn.free(token_type_embeddings)
+    ttnn.deallocate(word_embeddings)
+    ttnn.deallocate(token_type_embeddings)

     encoder_input = ttnn.experimental.layer_norm(
         embeddings,
         weight=parameters[f"bert.embeddings.LayerNorm.weight"],
         bias=parameters[f"bert.embeddings.LayerNorm.bias"],
         memory_config=ttnn.L1_MEMORY_CONFIG,
     )
-    ttnn.free(embeddings)
+    ttnn.deallocate(embeddings)

     encoder_output = None
     for encoder_index in range(num_encoders):
-        encoder_input = ttnn.Tensor(ttl.tensor.move(encoder_input._tensor))
         encoder_output = ttnn_optimized_bert_encoder(
             encoder_input,
             attention_mask,
             parameters,
             encoder_index=encoder_index,
             head_size=head_size,
         )
+        encoder_output = ttnn.reallocate(encoder_output)
         encoder_input = encoder_output

-    encoder_output = ttnn.Tensor(ttl.tensor.move(encoder_output._tensor))
     return encoder_output
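
For context, ttnn.reallocate (used in the loop above in place of the lower-level ttl.tensor.move call) copies the surviving tensor into freshly allocated device memory, which helps compact L1 between encoder layers after intermediates have been deallocated. A minimal sketch of that pattern, not part of this commit; run_layers and the layers callables are hypothetical stand-ins for the encoder stack above:

import ttnn

def run_layers(hidden_states, layers):
    # `layers`: a list of callables, each mapping one ttnn tensor to another
    # (stand-ins for ttnn_optimized_bert_encoder above).
    for layer in layers:
        output = layer(hidden_states)
        # Move the surviving tensor into freshly allocated space so that the
        # buffers deallocated inside `layer` do not leave holes behind it.
        output = ttnn.reallocate(output)
        hidden_states = output
    return hidden_states
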


