FlexLLM server demo #1510

Draft pull request: wants to merge 41 commits from the streamlit branch into base branch inference.
Commits (41):
470a40f  init (goliaro, Sep 25, 2024)
7f23188  update (goliaro, Sep 25, 2024)
a2d2ac0  update (goliaro, Sep 25, 2024)
f8c90e6  update (goliaro, Sep 25, 2024)
2906e57  update (goliaro, Sep 26, 2024)
d62d9be  add max new tokens parameter (goliaro, Oct 1, 2024)
85797e0  backup (goliaro, Oct 1, 2024)
bb08d69  update (goliaro, Oct 1, 2024)
62275c2  backup (goliaro, Oct 2, 2024)
88d60ca  lora configs serialize / deserialize into single file (goliaro, Oct 2, 2024)
e453237  backup (goliaro, Oct 4, 2024)
5c8c448  . (goliaro, Oct 5, 2024)
21f8cb9  . (goliaro, Oct 5, 2024)
c5e813b  . (goliaro, Oct 5, 2024)
aa57f98  . (goliaro, Oct 5, 2024)
53c408c  frontend (goliaro, Oct 5, 2024)
1691100  bug fix (goliaro, Oct 6, 2024)
7ff96d7  fixes (goliaro, Oct 6, 2024)
7eb953a  Merge branch 'inference' into streamlit (goliaro, Nov 4, 2024)
92c2c37  fix (goliaro, Nov 5, 2024)
fbdf74e  updates (goliaro, Nov 5, 2024)
754abd7  Merge branch 'inference' into streamlit (goliaro, Nov 5, 2024)
10fb496  fix (goliaro, Nov 5, 2024)
79dc3a2  fix (goliaro, Nov 5, 2024)
4219806  fix (goliaro, Nov 5, 2024)
61f79ad  Merge branch 'inference' into streamlit (goliaro, Nov 6, 2024)
f542fbb  small fix (goliaro, Nov 6, 2024)
139b643  fix (goliaro, Nov 7, 2024)
b56ebd3  fix reset input grad for non-activated loras (goliaro, Nov 8, 2024)
3632754  fix (goliaro, Nov 8, 2024)
39e47a5  Merge branch 'inference' into streamlit (goliaro, Nov 8, 2024)
fca3d95  update (goliaro, Nov 8, 2024)
9a1eae5  demo fixes & readme (goliaro, Nov 8, 2024)
c71c6b3  load weights in parallel (goliaro, Nov 9, 2024)
d54fcf2  cleanup (goliaro, Nov 9, 2024)
f748515  cleanup (goliaro, Nov 9, 2024)
266a1ed  load weights faster in inference test (goliaro, Nov 9, 2024)
d771f6b  fix (goliaro, Nov 9, 2024)
fc626c6  cleanup and fixes (goliaro, Nov 9, 2024)
ab5aa4b  linting (goliaro, Nov 9, 2024)
7d99cf7  fix (goliaro, Nov 11, 2024)
Files changed:
1 change: 1 addition & 0 deletions docker/flexflow-environment/Dockerfile
@@ -125,6 +125,7 @@ RUN pip3 install transformers>=4.31.0 sentencepiece einops
 RUN pip3 install tensorflow notebook
 # PEFT-related
 RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft
+RUN pip3 install streamlit

 # Install Rust
 RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
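The only image change is the new streamlit dependency, which presumably backs the browser front end introduced in commit 53c408c; the rest of the Dockerfile is untouched.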
7 changes: 6 additions & 1 deletion docker/run.sh
@@ -17,6 +17,11 @@ hip_version=${hip_version:-"empty"}
 ATTACH_GPUS=${ATTACH_GPUS:-true}
 gpu_arg=""
 if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi
+FORWARD_STREAMLIT_PORT=${FORWARD_STREAMLIT_PORT:-true}
+port_forward_arg=""
+if $FORWARD_STREAMLIT_PORT ; then
+  port_forward_arg+="-p 8501:8501"
+fi


 # Amount of shared memory to give the Docker container access to
@@ -120,4 +125,4 @@ if [ -f "$hf_token_path" ]; then
   hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token"
 fi

-eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest"
+eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${port_forward_arg}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest"
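With the default FORWARD_STREAMLIT_PORT=true, the container now publishes port 8501, Streamlit's default serving port, so the demo UI should be reachable at http://localhost:8501 on the host; setting FORWARD_STREAMLIT_PORT=false skips the mapping. port_forward_arg is assembled with += from an empty string, mirroring how the script builds its other optional docker arguments.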
12 changes: 11 additions & 1 deletion include/flexflow/batch_config.h
@@ -20,6 +20,7 @@
 #include "legion.h"
 #include <cstddef>
 #include <cstdlib>
+#include <cstring>

 // #define MAX_SEQ_LEN 1024
 // #define BATCH_SIZE 2
@@ -44,6 +45,11 @@ struct OptimizerTasks {
   bool save_updated_weights = false;
 };

+struct NewPeftModelPath {
+  PEFTModelID peft_model_id;
+  std::string filepath;
+};
+
 void set_optimizer_tasks(OptimizerTasks &tasks,
                          int max_training_steps,
                          int completed_training_steps,
@@ -74,6 +80,7 @@ class BatchConfig {
   static int const MAX_NUM_REQUESTS = 65;
   static int const MAX_NUM_TOKENS = 1024;
   static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
+  static int const MAX_PEFT_CONFIG_SIZE = 1024;

   // Set by update

@@ -89,11 +96,12 @@
       num_tokens_in_batch = 0;
       max_length = 0;
       request_guid = 0;
-      peft_model_id = PEFTModelID::NO_ID;
       prompt_phase = false;
       batch_config_request_id = -1;
+      peft_model_id = PEFTModelID::NO_ID;
       peft_bwd = false;
       optimizer_tasks = {true, false, false, false};
+      std::memset(peft_model_config_str, 0, MAX_PEFT_CONFIG_SIZE);
     }
     int first_token_depth_in_request;
     int first_token_offset_in_batch;
@@ -106,6 +114,7 @@
     RequestGuid request_guid;
     // PEFT fields
     PEFTModelID peft_model_id;
+    char peft_model_config_str[MAX_PEFT_CONFIG_SIZE];
     bool peft_bwd;
     OptimizerTasks optimizer_tasks;
   };
@@ -135,6 +144,7 @@
   PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
   PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
   PerTokenInfo labelsInfo[MAX_NUM_TOKENS];
+  NewPeftModelPath new_peft_model_paths[MAX_NUM_REQUESTS];

   bool request_completed[MAX_NUM_REQUESTS];
   bool request_running[MAX_NUM_REQUESTS];
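The serialized adapter configuration travels in a fixed-size, zeroed char buffer (hence the new <cstring> include and the std::memset in the per-request constructor) rather than a std::string, presumably so that the per-request info stays safe to copy by value through Legion task launches. A minimal sketch of filling that buffer; write_peft_config_str is a hypothetical helper, not code from this PR:

    #include <cassert>
    #include <cstring>
    #include <string>

    // Hypothetical helper: copy a serialized LoraLinearConfig into the
    // fixed-size buffer carried by BatchConfig. Truncation is a hard error in
    // this sketch; the real code may handle oversized configs differently.
    void write_peft_config_str(char *dst, std::string const &serialized) {
      size_t const MAX_PEFT_CONFIG_SIZE = 1024; // mirrors BatchConfig's constant
      assert(serialized.size() + 1 <= MAX_PEFT_CONFIG_SIZE);
      std::memset(dst, 0, MAX_PEFT_CONFIG_SIZE);
      std::memcpy(dst, serialized.c_str(), serialized.size() + 1);
    }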
4 changes: 0 additions & 4 deletions include/flexflow/config.h
@@ -104,8 +104,6 @@ struct FFHandler {
   // PEFT related fields
   MemoryAllocator *peft_activation_allocator;
   size_t peft_activation_reserve_space_size;
-  PEFTWeightAllocator *peft_weight_allocator;
-  size_t peft_weight_reserve_space_size;
   // Quantization fields
   DataType quantization_type;
   bool allowTensorOpMathConversion;
@@ -118,7 +116,6 @@ struct FFInitInfo {
   size_t workSpaceSize;
   size_t offload_reserve_space_size;
   size_t peft_activation_reserve_space_size;
-  size_t peft_weight_reserve_space_size;
   DataType quantization_type;
   bool allowTensorOpMathConversion;
   // int myRank, allRanks;
@@ -179,7 +176,6 @@ class FFConfig {
   // PEFT related fields
   bool enable_peft;
   size_t peft_activation_reserve_space_size;
-  size_t peft_weight_reserve_space_size;
   // Control parallelizable dimensions
   bool only_data_parallel;
   bool enable_sample_parallel;
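These are pure removals: the statically reserved PEFT weight space and its PEFTWeightAllocator disappear from the handler, the init info, and the config. Adapter weight memory is instead owned per operator by the PEFTMemoryManager that appears in lora_linear_kernels.h below, bounded by the new max_rank and max_concurrent_adapters limits.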
1 change: 1 addition & 0 deletions include/flexflow/fftype.h
@@ -27,6 +27,7 @@ class PEFTModelID {
   PEFTModelID(size_t id);
   bool is_valid_id() const;
   friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs);
+  friend bool operator!=(PEFTModelID const &lhs, PEFTModelID const &rhs);
   friend std::ostream &operator<<(std::ostream &os,
                                   PEFTModelID const &peft_model_id);
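The new operator!= rounds out PEFTModelID comparisons. A typical guard, shown here as a hypothetical call-site fragment rather than code from this diff:

    // Skip PEFT work for requests that carry no adapter.
    if (bc->requestsInfo[i].peft_model_id != PEFTModelID::NO_ID) {
      // apply the LoRA adapter registered for this request
    }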
11 changes: 10 additions & 1 deletion include/flexflow/flexflow_c.h
@@ -91,6 +91,8 @@ int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_);

 int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_);

+bool flexflow_config_get_enable_peft(flexflow_config_t handle_);
+
 void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_,
                                                  int value);

@@ -622,7 +624,11 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_,
                                             bool beam_search,
                                             char const *name);

-flexflow_peft_model_id_t flexflow_model_add_lora_layer(
+void flexflow_model_add_lora_layers(flexflow_model_t handle_,
+                                    int num_target_modules,
+                                    char const **target_modules_);
+
+flexflow_peft_model_id_t flexflow_model_register_peft_adapter(
     flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_);

 void flexflow_model_set_sgd_optimizer(flexflow_model_t handle,

@@ -1023,6 +1029,9 @@ void flexflow_request_manager_set_max_sequence_length(
 int flexflow_request_manager_get_max_sequence_length(
     flexflow_request_manager_t handle_);

+void flexflow_request_manager_set_max_concurrent_adapters(
+    flexflow_request_manager_t handle_, int max_concurrent_adapters);
+
 void flexflow_request_manager_set_enable_peft_finetuning(
     flexflow_request_manager_t handle_, bool enable_peft_finetuning_);
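The C API now mirrors the two-step C++ flow described under model.h below: flexflow_model_add_lora_layer, which created a layer and returned an adapter ID in one shot, is replaced by flexflow_model_add_lora_layers (declare the target modules once) plus flexflow_model_register_peft_adapter (register each adapter afterwards). A hedged sketch of a call site; the handle variables and the module name are assumptions:

    // model_ and lora_config_ are flexflow_model_t and
    // flexflow_lora_linear_config_t handles obtained elsewhere; "down_proj"
    // is an illustrative target-module name.
    char const *target_modules[] = {"down_proj"};
    flexflow_model_add_lora_layers(model_, 1, target_modules);
    flexflow_peft_model_id_t peft_id =
        flexflow_model_register_peft_adapter(model_, lora_config_);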
11 changes: 7 additions & 4 deletions include/flexflow/model.h
@@ -278,6 +278,7 @@ enum TaskIDs {
   RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
   RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
   RM_BACKGROUND_SERVING_TASK_ID,
+  LOAD_WEIGHT_TASK_ID,
   // Custom tasks
   CUSTOM_GPU_TASK_ID_FIRST,
   CUSTOM_GPU_TASK_ID_1,
@@ -835,7 +836,9 @@ class FFModel {
   // ========================================
   // PEFT Layers
   // ========================================
-  PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config);
+  // PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config);
+  void add_lora_layers(std::vector<std::string> target_modules);
+  PEFTModelID *register_peft_adapter(LoraLinearConfig const &peft_config);
   // ========================================
   // Inference APIs
   // ========================================
@@ -1170,9 +1173,9 @@
   std::vector<ParallelTensor> parameters;
   // PEFT related
   std::unordered_map<Layer *, Layer *> base_layer_to_peft_layer;
-  std::unordered_map<Layer *, std::vector<PEFTModelID>> peft_layer_to_peft_id;
-  std::unordered_map<PEFTModelID, LoraLinearConfig> peft_configs;
-  // std::vector<Op *> peft_operators;
+  // std::unordered_map<Layer *, std::vector<PEFTModelID>>
+  // peft_layer_to_peft_id; std::unordered_map<PEFTModelID, LoraLinearConfig>
+  // peft_configs; std::vector<Op *> peft_operators;

   FFHandler handlers[MAX_NUM_WORKERS];
   Legion::Future current_metrics;
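The new LOAD_WEIGHT_TASK_ID presumably backs the "load weights in parallel" commits (c71c6b3, 266a1ed). On the API side, add_lora_layer splits into add_lora_layers, which fixes the set of LoRA-capable layers when the model is built, and register_peft_adapter, which attaches a concrete adapter afterwards; the per-adapter maps accordingly move out of FFModel. A hedged usage sketch; the LoraLinearConfig constructor arguments and the module name are assumptions:

    FFModel model(ffconfig);
    // ... build the base model ...

    // Structural step: insert a LoraLinear op wherever a layer name matches
    // a target module (module name illustrative).
    model.add_lora_layers({"down_proj"});

    // Registration step: attach a concrete adapter; constructor args assumed.
    LoraLinearConfig peft_config(cache_folder, "some-user/llama-lora-adapter");
    PEFTModelID *peft_model_id = model.register_peft_adapter(peft_config);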
2 changes: 1 addition & 1 deletion include/flexflow/operator.h
@@ -280,7 +280,7 @@
     // get operator name and print it
     std::string op_name_without_uid = get_op_name_without_uid(m);
     std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid
-              << std::endl;
+              << (before_kernel ? " (before kernel)" : "") << std::endl;
     // build the path to save the tensor
     fs::path dst_filepath;
     if (fwd_pass) {
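A small debugging nicety: tensor dumps taken before the kernel launch are now tagged "(before kernel)" in the log line, so they can be told apart from post-kernel dumps of the same operator.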
2 changes: 2 additions & 0 deletions include/flexflow/ops/kernels/linear_kernels.h
@@ -61,6 +61,7 @@ void inference_kernel_wrapper(LinearMeta *m,
                               int out_dim,
                               int batch_size);
 void peft_bwd_kernel_wrapper(LinearMeta const *m,
+                             BatchConfig const *bc,
                              void *input_grad_ptr,
                              void *output_grad_ptr,
                              void const *kernel_ptr,
@@ -94,6 +95,7 @@ void forward_kernel(LinearMeta const *m,
                     ffStream_t stream);
 template <typename DT>
 void peft_bwd_kernel(LinearMeta const *m,
+                     BatchConfig const *bc,
                      void *input_grad_ptr,
                      void *output_grad_ptr,
                      void const *kernel_ptr,
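The linear PEFT backward path now receives the BatchConfig, giving the kernel per-request context (adapter IDs, peft_bwd flags) it previously lacked; plausibly this is what lets commit b56ebd3 reset input gradients only for requests whose LoRA adapters are actually active, though the diff here shows only the signature change.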
38 changes: 12 additions & 26 deletions include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -6,43 +6,27 @@
 #include "flexflow/fftype.h"
 #include "flexflow/op_meta.h"
 #include "flexflow/ops/lora_linear.h"
+#include "flexflow/utils/peft_weight_allocator.h"

 namespace FlexFlow {

 using Legion::Context;
 using Legion::Runtime;
-struct LoraLinearWeight {
-  // weights
-  void *w0_ptr, *w1_ptr;
-  // gradients
-  void *w0_grad_ptr, *w1_grad_ptr;
-  // v values for SGD optimizer (when using momentum)
-  void *w0_v_values_ptr, *w1_v_values_ptr;
-  int in_dim, out_dim, rank, num_shards;
-};
-
-struct LoraLinearModelState {
-  LoraLinearWeight weights;
-  LoraOptimizerConfig const *optimizer_config;
-  float lora_alpha;
-  std::string cache_folder;
-  // Huggingface model ID (for download and/or upload)
-  std::string peft_model_id;
-};
-
 class LoraLinearMeta : public OpMeta {
 public:
   LoraLinearMeta(FFHandler handle, LoraLinear const *li);
   ~LoraLinearMeta(void);
   // PEFT related fields
   void *low_rank_activation;
   void *input_activation;
-  std::unordered_map<PEFTModelID, LoraLinearModelState> model_state;
-  size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0;
+  PEFTMemoryManager *peft_memory_manager;
 };

 namespace Kernels {
 namespace LoraLinear {
-void init_kernel_wrapper(LoraLinearMeta *m, int seed);

+bool lora_applies_to_this_layer(LoraLinearMeta *m,
+                                LoraLinearConfig const &config);
+
+// void init_kernel_wrapper(LoraLinearMeta *m, int seed);
 void inference_kernel_wrapper(LoraLinearMeta *m,
                               BatchConfig const *bc,
                               GenericTensorAccessorR const &input,
@@ -51,12 +35,13 @@ void peft_bwd_kernel_wrapper(Context ctx,
                              Runtime *runtime,
                              LoraLinearMeta *m,
                              BatchConfig const *bc,
+                             int shard_id,
                              GenericTensorAccessorW const &input_grad,
                              GenericTensorAccessorR const &output_grad);

 namespace Internal {
-template <typename DT>
-void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream);
+// template <typename DT>
+// void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream);
 template <typename DT>
 void inference_kernel(LoraLinearMeta *m,
                       BatchConfig const *bc,
@@ -70,6 +55,7 @@ void peft_bwd_kernel(Context ctx,
                      Runtime *runtime,
                      LoraLinearMeta *m,
                      BatchConfig const *bc,
+                     int shard_id,
                      DT *input_grad_ptr,
                      DT const *output_grad_ptr,
                      int in_dim,
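Per-adapter state leaves the op meta: the LoraLinearWeight and LoraLinearModelState structs move behind the new PEFTMemoryManager (note the peft_weight_allocator.h include), and adapter applicability becomes an explicit query. A hedged sketch of what lora_applies_to_this_layer might do; the target_modules field and the op_name member are assumptions based on the surrounding API:

    #include <string>

    bool lora_applies_to_this_layer(LoraLinearMeta *m,
                                    LoraLinearConfig const &config) {
      // An adapter is assumed to apply when the operator's name contains one
      // of the adapter's target module names (e.g. "down_proj").
      std::string const name(m->op_name);
      for (std::string const &target : config.target_modules) {
        if (name.find(target) != std::string::npos) {
          return true;
        }
      }
      return false;
    }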
19 changes: 10 additions & 9 deletions include/flexflow/ops/lora_linear.h
@@ -17,14 +17,13 @@ class LoraLinear : public Op {
   using Params = LoraLinearParams;
   using Input = std::pair<ParallelTensor, ParallelTensor>;

-  LoraLinear(
-      FFModel &model,
-      LayerID const &layer_guid,
-      OperatorType type,
-      ParallelTensor const input,
-      ParallelTensor const output,
-      std::unordered_map<PEFTModelID, LoraLinearConfig> const &_peft_configs,
-      char const *name = nullptr);
+  LoraLinear(FFModel &model,
+             LayerID const &layer_guid,
+             ParallelTensor const input,
+             ParallelTensor const output,
+             int max_rank,
+             int max_concurrent_adapters,
+             char const *name = nullptr);
   LoraLinear(FFModel &model,
              LoraLinear const &other,
              ParallelTensor const input,
@@ -91,7 +90,9 @@
   // size_t get_params_hash() const override;
   LoraLinearParams get_params() const;

-  std::unordered_map<PEFTModelID, LoraLinearConfig> peft_configs;
+  // std::unordered_map<PEFTModelID, LoraLinearConfig> peft_configs;
+  int max_rank;
+  int max_concurrent_adapters;
 };

 }; // namespace FlexFlow
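Since adapters are no longer baked into the op at construction, the constructor only needs capacity bounds, matching the new flexflow_request_manager_set_max_concurrent_adapters knob in flexflow_c.h above. One plausible consequence, sketched below as an assumption rather than code from this PR, is that the adapter weight pool can be sized once up front:

    #include <cstddef>

    // Hypothetical sizing rule for a pre-allocated LoRA weight pool: each
    // adapter slot holds an A matrix (in_dim x max_rank) and a B matrix
    // (max_rank x out_dim), so capacity depends only on the declared maxima.
    size_t lora_pool_bytes(size_t in_dim, size_t out_dim, int max_rank,
                           int max_concurrent_adapters, size_t bytes_per_elem) {
      size_t const per_adapter =
          static_cast<size_t>(max_rank) * (in_dim + out_dim);
      return per_adapter * max_concurrent_adapters * bytes_per_elem;
    }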