Commit
Update LLAMA tokenizer (#1524)
* fix tokenizer conversion

* update

* update

* update

* fix

* fix

* lint

* simplify api

* fix

* fix

* fix

* update to 12.1 (#1512)

* fix deadlock?

* remove barrier where not strictly needed

---------

Co-authored-by: zhihao <email>
sfc-gh-goliaro committed Oct 22, 2024
1 parent 13615f4 commit 5a0c1ca
Showing 40 changed files with 769 additions and 289 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -190,3 +190,5 @@ python/flexflow/version.txt

inference_tensors
tests/inference/python_test_configs/*.json

core.*
36 changes: 36 additions & 0 deletions include/flexflow/flexflow_c.h
@@ -445,6 +445,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -466,6 +472,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -487,6 +499,12 @@ flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -508,6 +526,12 @@ flexflow_tensor_t flexflow_model_add_groupquery_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -530,6 +554,12 @@ flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
@@ -552,6 +582,12 @@ flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
enum DataType data_type,
flexflow_initializer_t kernel_initializer_,
bool apply_rotary_embedding,
float rope_theta,
char const *rope_type,
float rope_factor,
float low_freq_factor,
float high_freq_factor,
int original_max_position_embeddings,
bool scaling_query,
float scaling_factor,
bool qk_prod_scaling,
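The same six RoPE parameters are threaded through every attention entry point of the C API above, immediately after apply_rotary_embedding. A rough sketch of the values a caller might supply (the concrete numbers are assumptions for a Llama-3-style scaled-RoPE configuration, not defaults taken from this commit):

    /* Hypothetical RoPE arguments; they mirror the fields of RotaryEmbeddingMeta
       introduced in include/flexflow/inference.h below. */
    float rope_theta = 500000.0f;                /* base frequency */
    char const *rope_type = "llama3";            /* "default" or a scaled variant */
    float rope_factor = 8.0f;                    /* context-extension scaling factor */
    float low_freq_factor = 1.0f;
    float high_freq_factor = 4.0f;
    int original_max_position_embeddings = 8192; /* pre-scaling context length */
    /* ...passed positionally after apply_rotary_embedding in each
       flexflow_model_add_*_attention call. */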
39 changes: 37 additions & 2 deletions include/flexflow/inference.h
@@ -160,8 +160,43 @@ class TraceEmissionMachine : public EmissionMachine {
double sample_slo_ratio() override;
};

#include <string>
#include <vector>
struct RotaryEmbeddingMeta {
bool apply_rotary_embedding = false;
float rope_theta = 10000.0f;
std::string rope_type = "default";
float factor = 8.0f;
float low_freq_factor = 1.0f;
float high_freq_factor = 4.0f;
int original_max_position_embeddings = 8192;

RotaryEmbeddingMeta(bool apply_rotary_embedding_ = false,
float rope_theta_ = 10000.0f,
std::string rope_type_ = "default",
float factor_ = 8.0f,
float low_freq_factor_ = 1.0f,
float high_freq_factor_ = 4.0f,
int original_max_position_embeddings_ = 8192)
: apply_rotary_embedding(apply_rotary_embedding_),
rope_theta(rope_theta_), rope_type(rope_type_), factor(factor_),
low_freq_factor(low_freq_factor_), high_freq_factor(high_freq_factor_),
original_max_position_embeddings(original_max_position_embeddings_) {}

friend std::ostream &operator<<(std::ostream &os,
RotaryEmbeddingMeta const &meta) {
os << std::boolalpha // To print bool as true/false instead of 1/0
<< "RotaryEmbeddingMeta {\n"
<< " apply_rotary_embedding: " << meta.apply_rotary_embedding << ",\n"
<< " rope_theta: " << meta.rope_theta << ",\n"
<< " rope_type: \"" << meta.rope_type << "\",\n"
<< " factor: " << meta.factor << ",\n"
<< " low_freq_factor: " << meta.low_freq_factor << ",\n"
<< " high_freq_factor: " << meta.high_freq_factor << ",\n"
<< " original_max_position_embeddings: "
<< meta.original_max_position_embeddings << "\n"
<< "}";
return os;
}
};

std::string join_path(std::vector<std::string> const &paths);

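Because RotaryEmbeddingMeta is a plain aggregate with a stream operator, it can be constructed and pretty-printed directly. A minimal sketch, assuming the struct lives in the FlexFlow namespace like the rest of inference.h and using illustrative Llama-3-style values:

    #include <iostream>
    #include "flexflow/inference.h"

    int main() {
      // Hypothetical configuration: RoPE enabled with 8x context-extension scaling.
      FlexFlow::RotaryEmbeddingMeta rope_meta(
          /*apply_rotary_embedding_=*/true,
          /*rope_theta_=*/500000.0f,
          /*rope_type_=*/"llama3",
          /*factor_=*/8.0f,
          /*low_freq_factor_=*/1.0f,
          /*high_freq_factor_=*/4.0f,
          /*original_max_position_embeddings_=*/8192);
      std::cout << rope_meta << std::endl; // printed via the friend operator<<
      return 0;
    }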
3 changes: 3 additions & 0 deletions include/flexflow/layer.h
@@ -32,11 +32,13 @@ class Layer {
void add_float_property(std::string const &key, float value);
void add_int_vector_property(std::string const &key,
std::vector<int> const &value);
void add_string_property(std::string const &key, std::string const &value);
void add_initializer(std::string const &key, Initializer *initializer);
bool get_int_property(std::string const &key, long long &value) const;
bool get_float_property(std::string const &key, float &value) const;
bool get_int_vector_property(std::string const &key,
std::vector<int> &value) const;
bool get_string_property(std::string const &key, std::string &value) const;
bool get_initializer(std::string const &key, Initializer *&initializer) const;
Tensor get_parameter(int index);
void print();
@@ -59,6 +61,7 @@ class Layer {
std::unordered_map<std::string, float> float_properties;
std::unordered_map<std::string, Initializer *> initializers;
std::unordered_map<std::string, std::vector<int>> int_vector_properties;
std::unordered_map<std::string, std::string> string_properties;
};

}; // namespace FlexFlow
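
The new string-property accessors mirror the existing int/float ones; the attention layers can use them to stash RotaryEmbeddingMeta fields such as rope_type on a Layer before the corresponding Op is built. A short usage sketch (the surrounding layer-construction code is assumed, not shown in this diff):

    // Hypothetical: 'layer' is a Layer* obtained during graph construction.
    layer->add_string_property("rope_type", rotary_embedding_meta.rope_type);

    std::string rope_type;
    if (layer->get_string_property("rope_type", rope_type)) {
      // configure the attention kernel for "default", "llama3", ...
    }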
83 changes: 42 additions & 41 deletions include/flexflow/model.h
@@ -709,43 +709,44 @@ class FFModel {
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
char const *name = NULL);
Tensor inc_multihead_self_attention(Tensor const input,
int embed_dim,
int num_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
bool streaming_cache = false,
char const *name = NULL);
Tensor
spec_inc_multihead_self_attention(Tensor const input,
int embed_dim,
int num_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
bool streaming_cache = false,
char const *name = NULL);
Tensor inc_multihead_self_attention(
const Tensor input,
int embed_dim,
int num_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
bool streaming_cache = false,
char const *name = NULL);
Tensor spec_inc_multihead_self_attention(
const Tensor input,
int embed_dim,
int num_heads,
int kdim = 0,
int vdim = 0,
float dropout = 0.0f,
bool bias = false,
bool add_bias_kv = false,
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
bool position_bias = false,
bool streaming_cache = false,
char const *name = NULL);
Tensor inc_multihead_self_attention_verify(
Tensor const input,
int embed_dim,
@@ -758,7 +759,7 @@
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
@@ -776,7 +777,7 @@
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
@@ -796,7 +797,7 @@
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
@@ -816,7 +817,7 @@
bool add_zero_attn = false,
DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
bool apply_rotary_embedding = false,
RotaryEmbeddingMeta rotary_embedding_meta = RotaryEmbeddingMeta(),
bool scaling_query = false,
float scaling_factor = 1.0f,
bool qk_prod_scaling = true,
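At the FFModel level the old apply_rotary_embedding flag becomes a RotaryEmbeddingMeta argument in all of the attention builders above. A sketch of an updated call site (ff, input, and the dimension values are placeholders, not taken from this commit):

    RotaryEmbeddingMeta rope_meta(true, 500000.0f, "llama3", 8.0f, 1.0f, 4.0f, 8192);
    Tensor attn_out = ff.inc_multihead_self_attention(
        input,
        /*embed_dim=*/4096,
        /*num_heads=*/32,
        /*kdim=*/0,
        /*vdim=*/0,
        /*dropout=*/0.0f,
        /*bias=*/false,
        /*add_bias_kv=*/false,
        /*add_zero_attn=*/false,
        /*data_type=*/DT_NONE,
        /*kernel_initializer=*/NULL,
        rope_meta); // remaining arguments keep their declared defaults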
16 changes: 14 additions & 2 deletions include/flexflow/operator.h
@@ -304,8 +304,20 @@ class Op {
assert(false && "Tensor data type not supported");
}
}
// only dump the weights once
if (m->decoding_step == 0) {

// Only dump the weights in the forward pass, at the first decoding step.
// Note that we do not save the weight gradients, since we only support
// finetuning LoRA weights, which are not FF tensors.
// Set FF_DEBG_NO_WEIGHTS=1 or FF_DEBG_NO_WEIGHTS=true to disable saving
// weights.
bool do_not_save_weights =
(std::getenv("FF_DEBG_NO_WEIGHTS") &&
(std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "1" ||
std::string(std::getenv("FF_DEBG_NO_WEIGHTS")) == "true"));
if (fwd_pass && m->decoding_step == 0 && !do_not_save_weights) {
fs::path dst_filepath_weights =
get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) /
layername;
for (int i = 0; i < weight_tensors.size(); i++) {
std::string filename = base_filepath + "_weight_" + std::to_string(i);
if (weight_tensors[i].data_type == DT_FLOAT) {
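The new weight-dumping guard can be toggled from a test or launcher by setting the environment variable before the first forward step. A minimal sketch using POSIX setenv (the call site is hypothetical; only the variable name and accepted values come from operator.h):

    #include <cstdlib>

    // Disable weight dumping for debug runs; the check accepts "1" or "true".
    setenv("FF_DEBG_NO_WEIGHTS", "1", /*overwrite=*/1);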
12 changes: 6 additions & 6 deletions include/flexflow/ops/inc_multihead_self_attention.h
@@ -40,7 +40,7 @@ class IncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -63,7 +63,7 @@ class IncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -129,8 +129,8 @@ class IncMultiHeadSelfAttention : public Op {
int num_q_heads, num_kv_heads, tensor_parallelism_degree;
float dropout, scaling_factor;
bool qkv_bias;
bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
qk_prod_scaling, position_bias;
bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
RotaryEmbeddingMeta rotary_embedding_meta;
int hidden_size, qk_dim, v_dim, o_dim;
int qoSeqLength, kvSeqLength;
DataType quantization_type;
@@ -153,7 +153,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
int _qk_dim,
int _v_dim,
int _o_dim,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _qkv_bias,
bool _scaling_query,
bool _qk_prod_scaling,
@@ -180,7 +180,7 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta {
int global_num_q_heads, global_num_kv_heads, num_q_heads, num_kv_heads,
local_hidden_size;
bool *has_load_weights;
bool *apply_rotary_embedding;
RotaryEmbeddingMeta *rotary_embedding_meta;
bool *qkv_bias;
bool *final_bias;
bool *scaling_query;
6 changes: 4 additions & 2 deletions include/flexflow/ops/inc_multihead_self_attention_params.h
@@ -3,6 +3,7 @@

#include "flexflow/ffconst.h"
#include "flexflow/fftype.h"
#include "flexflow/inference.h"
#include "flexflow/parallel_tensor.h"

namespace FlexFlow {
@@ -12,8 +13,9 @@ struct IncMultiHeadSelfAttentionParams {
int embed_dim, num_q_heads, kdim, vdim, num_kv_heads,
tensor_parallelism_degree;
float dropout, scaling_factor;
bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
scaling_query, qk_prod_scaling, position_bias;
bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling,
position_bias;
RotaryEmbeddingMeta rotary_embedding_meta;
DataType quantization_type;
bool offload, streaming_cache;
char name[MAX_OPNAME];
8 changes: 4 additions & 4 deletions include/flexflow/ops/spec_inc_multihead_self_attention.h
@@ -36,7 +36,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -56,7 +56,7 @@ class SpecIncMultiHeadSelfAttention : public Op {
bool _qkv_bias,
bool _final_bias,
bool _add_zero_attn,
bool _apply_rotary_embedding,
RotaryEmbeddingMeta _rotary_embedding_meta,
bool _scaling_query,
float _scaling_factor,
bool _qk_prod_scaling,
@@ -122,8 +122,8 @@ class SpecIncMultiHeadSelfAttention : public Op {
int num_q_heads, num_kv_heads, tensor_parallelism_degree;
float dropout, scaling_factor;
bool qkv_bias;
bool final_bias, add_zero_attn, apply_rotary_embedding, scaling_query,
qk_prod_scaling, position_bias;
bool final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
RotaryEmbeddingMeta rotary_embedding_meta;
int hidden_size, qk_dim, v_dim, o_dim;
int qoSeqLength, kvSeqLength;
bool streaming_cache;
include/flexflow/ops/spec_inc_multihead_self_attention_params.h
@@ -11,8 +11,8 @@ struct SpecIncMultiHeadSelfAttentionParams {
LayerID layer_guid;
int embed_dim, num_q_heads, num_kv_heads, kdim, vdim;
float dropout, scaling_factor;
bool qkv_bias, final_bias, add_zero_attn, apply_rotary_embedding,
scaling_query, qk_prod_scaling, position_bias;
bool qkv_bias, final_bias, add_zero_attn, scaling_query, qk_prod_scaling, position_bias;
RotaryEmbeddingMeta rotary_embedding_meta;
bool streaming_cache;
char name[MAX_OPNAME];
bool is_valid(ParallelTensorShape const &) const;
