Update LLAMA tokenizer #1524

Merged: 14 commits, Oct 19, 2024
8 changes: 4 additions & 4 deletions .github/workflows/gpu-ci.yml
@@ -56,7 +56,7 @@ jobs:
 CONDA: "3"
 needs: gpu-ci-concierge
 container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
 options: --gpus all --shm-size=8192m
 steps:
 - name: Keep alive
@@ -75,7 +75,7 @@ jobs:
 CONDA: "3"
 needs: gpu-ci-concierge
 container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
 options: --gpus all --shm-size=8192m
 steps:
 - name: Install updated git version
@@ -151,7 +151,7 @@ jobs:
 HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
 needs: gpu-ci-concierge
 container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
 options: --gpus all --shm-size=8192m
 steps:
 - name: Install updated git version
@@ -239,7 +239,7 @@ jobs:
 CONDA: "3"
 needs: inference-tests
 container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
 options: --gpus all --shm-size=8192m
 steps:
 - name: Install updated git version
11 changes: 6 additions & 5 deletions cmake/nccl.cmake
@@ -36,11 +36,12 @@ if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR)
 string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES})
 string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2})
 set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}")
-if(NCCL_VERSION VERSION_LESS 2.23)
-set(NCCL_OLD TRUE)
-else()
-set(NCCL_OLD FALSE)
-endif()
+set(NCCL_OLD FALSE)
+# if(NCCL_VERSION VERSION_LESS 2.23)
+# set(NCCL_OLD TRUE)
+# else()
+# set(NCCL_OLD FALSE)
+# endif()
 message(STATUS "Found NCCL version: ${NCCL_VERSION}")
 else()
 message(WARNING "NCCL header not found, unable to determine version")
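Note on the hunk above: NCCL_VERSION is derived by regex-matching the first run of digits in the NCCL_MAJOR and NCCL_MINOR define lines, and NCCL_OLD is now pinned to FALSE instead of being compared against 2.23. The Python snippet below is only an illustrative restatement of that parsing step; the two input strings are hypothetical stand-ins for the NCCL_VERSION_DEFINES variables that the unchanged CMake code above this hunk extracts from nccl.h.

import re

# Hypothetical stand-ins for NCCL_VERSION_DEFINES / NCCL_VERSION_DEFINES2.
nccl_version_defines = "#define NCCL_MAJOR 2"
nccl_version_defines2 = "#define NCCL_MINOR 23"

# string(REGEX MATCH "([0-9]+)" ...) keeps the first run of digits it finds.
nccl_major = re.search(r"[0-9]+", nccl_version_defines).group(0)
nccl_minor = re.search(r"[0-9]+", nccl_version_defines2).group(0)
nccl_version = f"{nccl_major}.{nccl_minor}"  # e.g. "2.23"

# After this PR the "old NCCL" flag is unconditionally off; the previous
# behaviour (now commented out in the CMake) enabled it below version 2.23.
nccl_old = False
print(nccl_version, nccl_old)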
24 changes: 12 additions & 12 deletions docker/flexflow-environment/Dockerfile
@@ -55,18 +55,18 @@ ENV CUDA_DIR /usr/local/cuda
 ARG FF_GPU_BACKEND "cuda"

 # Update NCCL if FF_GPU_BACKEND is cuda
-RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \
-echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \
-ubuntu_version=$(lsb_release -rs); \
-ubuntu_version=${ubuntu_version//./}; \
-wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \
-DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \
-DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \
-rm -f cuda-keyring_1.0-1_all.deb; \
-DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \
-else \
-echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \
-fi'
+# RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \
+# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \
+# ubuntu_version=$(lsb_release -rs); \
+# ubuntu_version=${ubuntu_version//./}; \
+# wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \
+# DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \
+# DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \
+# rm -f cuda-keyring_1.0-1_all.deb; \
+# DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \
+# else \
+# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \
+# fi'

 # Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm
 # Note that amd's docs say to also install the `hip-runtime-nvidia` package. This
11 changes: 8 additions & 3 deletions include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -8,7 +8,8 @@
 #include "flexflow/ops/lora_linear.h"

 namespace FlexFlow {
-
+using Legion::Context;
+using Legion::Runtime;
 struct LoraLinearWeight {
 // weights
 void *w0_ptr, *w1_ptr;
@@ -46,7 +47,9 @@ void inference_kernel_wrapper(LoraLinearMeta *m,
 BatchConfig const *bc,
 GenericTensorAccessorR const &input,
 GenericTensorAccessorW const &output);
-void peft_bwd_kernel_wrapper(LoraLinearMeta *m,
+void peft_bwd_kernel_wrapper(Context ctx,
+Runtime *runtime,
+LoraLinearMeta *m,
 BatchConfig const *bc,
 GenericTensorAccessorW const &input_grad,
 GenericTensorAccessorR const &output_grad);
@@ -63,7 +66,9 @@ void inference_kernel(LoraLinearMeta *m,
 int out_dim,
 ffStream_t stream);
 template <typename DT>
-void peft_bwd_kernel(LoraLinearMeta *m,
+void peft_bwd_kernel(Context ctx,
+Runtime *runtime,
+LoraLinearMeta *m,
 BatchConfig const *bc,
 DT *input_grad_ptr,
 DT const *output_grad_ptr,
11 changes: 8 additions & 3 deletions include/flexflow/optimizer.h
@@ -20,7 +20,8 @@
 #include "legion.h"

 namespace FlexFlow {
-
+using Legion::Context;
+using Legion::Runtime;
 class FFModel;
 class OpMeta;

@@ -60,7 +61,9 @@ class SGDOptimizer : public Optimizer {
 std::vector<Legion::PhysicalRegion> const &regions,
 Legion::Context ctx,
 Legion::Runtime *runtime);
-static void nccl_update_task_gpu(SGDOptimizer const *op,
+static void nccl_update_task_gpu(Context ctx,
+Runtime *runtime,
+SGDOptimizer const *op,
 OpMeta const *meta,
 float const *w_grad_ptr,
 size_t size,
@@ -103,7 +106,9 @@ class AdamOptimizer : public Optimizer {
 std::vector<Legion::PhysicalRegion> const &regions,
 Legion::Context ctx,
 Legion::Runtime *runtime);
-static void nccl_update_task_gpu(AdamOptimizer const *op,
+static void nccl_update_task_gpu(Context ctx,
+Runtime *runtime,
+AdamOptimizer const *op,
 OpMeta const *meta,
 float const *w_grad_ptr,
 size_t size,
3 changes: 2 additions & 1 deletion include/flexflow/request_manager.h
@@ -68,7 +68,7 @@ struct Request {
 BatchConfig::RequestGuid guid;
 PEFTModelID peft_model_id = PEFTModelID::NO_ID;
 int max_length = -1;
-int max_new_tokens = 128;
+int max_new_tokens = -1;
 int initial_len;
 int ssm_cache_size = 0;
 int llm_cache_size = 0;
@@ -302,6 +302,7 @@
 ModelType model_type;
 int bos_token_id;
 int eos_token_id;
+bool old_llama_tokenizer = false;
 std::string output_filepath;
 std::queue<Request> pending_infr_request_queue;
 std::queue<Request> pending_peft_request_queue;
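With this hunk both length knobs in Request default to -1 (unset): max_length caps the total sequence length while max_new_tokens caps only the tokens generated after the prompt, and matching defaults presumably let the request manager tell which limit the caller actually set. RequestManager also gains an old_llama_tokenizer flag, presumably to keep decoding compatible with checkpoints that still use the legacy LLaMA tokenizer (per the PR title). A minimal sketch of setting the per-request budget through the Python bindings; the prompt string and the 128-token budget are illustrative, and it assumes ff.init(...) has been called and an LLM built and compiled as in the example scripts touched by this PR:

import flexflow.serve as ff  # import path as used by the inference/python examples

req = ff.Request(
    ff.RequestType.REQ_INFERENCE,
    prompt="Three tips for staying healthy are: ",
    max_new_tokens=128,  # cap on newly generated tokens, not total length
    # peft_model_id omitted here; the PEFT example below always passes one
)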
2 changes: 1 addition & 1 deletion inference/peft/peft.cc
@@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task,
 printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str());
 Request inference_req;
 inference_req.prompt = text;
-inference_req.max_length = 128;
+inference_req.max_new_tokens = 128;
 inference_req.peft_model_id =
 (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
 requests.push_back(inference_req);
3 changes: 1 addition & 2 deletions inference/python/ff_peft.py
@@ -162,7 +162,7 @@ def main():
 ff.Request(
 ff.RequestType.REQ_INFERENCE,
 prompt=prompt,
-max_sequence_length=128,
+max_new_tokens=128,
 peft_model_id=llm.get_ff_peft_id(lora_inference_config),
 )
 for prompt in prompts
@@ -172,7 +172,6 @@ def main():
 if len(configs.finetuning_dataset) > 0:
 finetuning_request = ff.Request(
 ff.RequestType.REQ_FINETUNING,
-max_sequence_length=128,
 peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
 dataset_filepath=configs.finetuning_dataset,
 max_training_steps=2,
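Taken together, the two hunks above mean inference requests are now bounded by max_new_tokens, while the finetuning request carries no length cap at all and is presumably sized by its dataset. A condensed sketch of the resulting request mix, reusing the same names as the script; the final submission of the list to the model is assumed to follow as in the unmodified remainder of the file:

# Condensed view of the request mix after this change (names as in ff_peft.py).
requests = [
    ff.Request(
        ff.RequestType.REQ_INFERENCE,
        prompt=prompt,
        max_new_tokens=128,  # new-token budget per inference request
        peft_model_id=llm.get_ff_peft_id(lora_inference_config),
    )
    for prompt in prompts
]
if len(configs.finetuning_dataset) > 0:
    requests.append(
        ff.Request(
            ff.RequestType.REQ_FINETUNING,  # no length cap; driven by the dataset
            peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
            dataset_filepath=configs.finetuning_dataset,
            max_training_steps=2,
        )
    )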
17 changes: 10 additions & 7 deletions inference/python/incr_decoding.py
@@ -51,12 +51,12 @@ def get_configs():
 "tensor_parallelism_degree": 1,
 "pipeline_parallelism_degree": 2,
 "offload": False,
-"offload_reserve_space_size": 8 * 1024, # 8GB
+"offload_reserve_space_size": 8 * 1024,  # 8GB
 "use_4bit_quantization": False,
 "use_8bit_quantization": False,
 "enable_peft": False,
-"peft_activation_reserve_space_size": 1024, # 1GB
-"peft_weight_reserve_space_size": 1024, # 1GB
+"peft_activation_reserve_space_size": 1024,  # 1GB
+"peft_weight_reserve_space_size": 1024,  # 1GB
 "profiling": False,
 "benchmarking": False,
 "inference_debugging": False,
@@ -71,6 +71,7 @@ def get_configs():
 "full_precision": False,
 "prompt": "",
 "output_file": "",
+"max_length": 128,
 }
 # Merge dictionaries
 ff_init_configs.update(llm_configs)
@@ -106,9 +107,9 @@ def main():
 max_seq_length=256,
 max_tokens_per_batch=64,
 )

 llm.start_server()

 if len(configs.prompt) > 0:
 prompts = [s for s in json.load(open(configs.prompt))]
 if "max_length" not in configs_dict:
@@ -119,8 +120,10 @@
 if "max_length" not in configs_dict:
 result = llm.generate("Three tips for staying healthy are: ")
 else:
-result = llm.generate("Three tips for staying healthy are: ", max_length=configs.max_length)
+result = llm.generate(
+"Three tips for staying healthy are: ", max_length=configs.max_length
+)

 llm.stop_server()
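The pattern above forwards max_length to llm.generate only when the user put a max_length key in their config file, and otherwise lets the library default apply. One equivalent, slightly more compact way to write the same fallback; this is a sketch, not what the PR does, and it assumes the same configs and configs_dict objects and that json is already imported in the script:

# Build the keyword arguments once, then reuse them in both branches.
gen_kwargs = {"max_length": configs.max_length} if "max_length" in configs_dict else {}

if len(configs.prompt) > 0:
    prompts = [s for s in json.load(open(configs.prompt))]
    results = llm.generate(prompts, **gen_kwargs)
else:
    result = llm.generate("Three tips for staying healthy are: ", **gen_kwargs)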
24 changes: 17 additions & 7 deletions inference/python/spec_infer.py
@@ -51,12 +51,12 @@ def get_configs():
 "tensor_parallelism_degree": 1,
 "pipeline_parallelism_degree": 2,
 "offload": False,
-"offload_reserve_space_size": 8 * 1024, # 8GB
+"offload_reserve_space_size": 8 * 1024,  # 8GB
 "use_4bit_quantization": False,
 "use_8bit_quantization": False,
 "enable_peft": False,
-"peft_activation_reserve_space_size": 1024, # 1GB
-"peft_weight_reserve_space_size": 1024, # 1GB
+"peft_activation_reserve_space_size": 1024,  # 1GB
+"peft_weight_reserve_space_size": 1024,  # 1GB
 "profiling": False,
 "benchmarking": False,
 "inference_debugging": False,
@@ -81,6 +81,7 @@ def get_configs():
 ],
 "prompt": "",
 "output_file": "",
+"max_length": 128,
 }
 # Merge dictionaries
 ff_init_configs.update(llm_configs)
@@ -144,17 +145,26 @@ def main():
 max_tokens_per_batch=64,
 ssms=ssms,
 )

 llm.start_server()

 if len(configs.prompt) > 0:
 prompts = [s for s in json.load(open(configs.prompt))]
-results = llm.generate(prompts)
+if "max_length" not in configs_dict:
+results = llm.generate(prompts)
+else:
+results = llm.generate(prompts, max_length=configs.max_length)
 else:
-result = llm.generate("Three tips for staying healthy are: ")
+if "max_length" not in configs_dict:
+result = llm.generate("Three tips for staying healthy are: ")
+else:
+result = llm.generate(
+"Three tips for staying healthy are: ", max_length=configs.max_length
+)

 llm.stop_server()


 if __name__ == "__main__":
 print("flexflow inference example (speculative inference)")
 main()