backend: rebase llama.cpp on upstream as of Sep 26th (#2998)
Signed-off-by: Jared Van Bortel <[email protected]>
cebtenzzre authored Sep 27, 2024
1 parent 8bd937e commit f9d6be8
Showing 16 changed files with 166 additions and 601 deletions.
2 changes: 1 addition & 1 deletion gpt4all-backend/deps/llama.cpp-mainline
10 changes: 6 additions & 4 deletions gpt4all-backend/include/gpt4all-backend/llmodel.h
@@ -7,6 +7,7 @@
#include <cstdint>
#include <functional>
#include <optional>
#include <span>
#include <stdexcept>
#include <string>
#include <string_view>
@@ -149,9 +150,9 @@ class LLModel {
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
virtual bool isModelLoaded() const = 0;
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
virtual size_t stateSize() const { return 0; }
virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
virtual size_t stateSize() const = 0;
virtual size_t saveState(std::span<uint8_t> dest) const = 0;
virtual size_t restoreState(std::span<const uint8_t> src) = 0;

// This method requires the model to return true from supportsCompletion otherwise it will throw
// an error
@@ -215,7 +216,8 @@ class LLModel {
virtual std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special = false) = 0;
virtual bool isSpecialToken(Token id) const = 0;
virtual std::string tokenToString(Token id) const = 0;
virtual Token sampleToken(PromptContext &ctx) const = 0;
virtual void initSampler(PromptContext &ctx) = 0;
virtual Token sampleToken() const = 0;
virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
virtual void shiftContext(PromptContext &promptCtx) = 0;
virtual int32_t contextLength() const = 0;
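For readers following the interface change above: a minimal caller-side sketch (not part of this commit) showing how the now pure-virtual, span-based state methods might be used to capture and resume model state. Only stateSize(), saveState(), and restoreState() are taken from the header; the helper names and error handling are illustrative assumptions.

#include <cstdint>
#include <cstdio>
#include <vector>

#include "llmodel.h"

// Hypothetical helper: capture a loaded model's state into a byte buffer.
static std::vector<uint8_t> captureState(const LLModel &model)
{
    std::vector<uint8_t> buf(model.stateSize());
    size_t written = model.saveState(buf);   // vector converts to std::span<uint8_t>
    if (written == 0)
        std::fprintf(stderr, "saveState failed\n");
    buf.resize(written);
    return buf;
}

// Hypothetical helper: restore a previously captured state into the same model.
static bool resumeState(LLModel &model, const std::vector<uint8_t> &buf)
{
    // vector<uint8_t> converts to std::span<const uint8_t>
    return model.restoreState(buf) == buf.size();
}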
12 changes: 7 additions & 5 deletions gpt4all-backend/include/gpt4all-backend/llmodel_c.h
@@ -148,18 +148,20 @@ uint64_t llmodel_get_state_size(llmodel_model model);
* NOTE: This state data is specific to the type of model you have created.
* @param model A pointer to the llmodel_model instance.
* @param dest A pointer to the destination.
* @return the number of bytes copied
* @param size The size of the destination buffer.
* @return the number of bytes copied, or zero on error.
*/
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size);

/**
* Restores the internal state of the model using data from the specified address.
* NOTE: This state data is specific to the type of model you have created.
* @param model A pointer to the llmodel_model instance.
* @param src A pointer to the src.
* @return the number of bytes read
* @param src A pointer to the state data.
* @param size The size of the source data.
* @return The number of bytes read, or zero on error.
*/
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size);

/**
* Generate a response using the model.
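A usage sketch of the updated C API documented above (not part of this commit): callers now pass the buffer size explicitly and treat a zero return as an error. Model creation and loading are omitted; only the three state functions shown in this header are assumed.

#include <cstdint>
#include <vector>

#include "llmodel_c.h"

// Sketch: save and restore model state through the C ABI, assuming `model`
// is an already-created and loaded llmodel_model handle.
static bool roundtrip_state(llmodel_model model)
{
    uint64_t size = llmodel_get_state_size(model);
    std::vector<uint8_t> buf(size);

    // Both calls now take an explicit size and return 0 on error.
    if (llmodel_save_state_data(model, buf.data(), buf.size()) == 0)
        return false;
    return llmodel_restore_state_data(model, buf.data(), buf.size()) != 0;
}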
7 changes: 5 additions & 2 deletions gpt4all-backend/llama.cpp.cmake
@@ -978,10 +978,13 @@ function(include_ggml SUFFIX)

add_library(llama${SUFFIX} STATIC
${DIRECTORY}/include/llama.h
${DIRECTORY}/src/llama-grammar.cpp
${DIRECTORY}/src/llama-sampling.cpp
${DIRECTORY}/src/llama-vocab.cpp
${DIRECTORY}/src/llama.cpp
${DIRECTORY}/src/unicode.h
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode-data.cpp
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode.h
)

target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
174 changes: 93 additions & 81 deletions gpt4all-backend/src/llamamodel.cpp
Expand Up @@ -2,6 +2,7 @@
#include "llamamodel_impl.h"

#include "llmodel.h"
#include "utils.h"

#include <ggml.h>
#include <llama.h>
@@ -103,26 +104,34 @@ static bool llama_verbose()
return var && *var;
}

static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
static void llama_log_callback(ggml_log_level level, const char *text, void *userdata, bool warn)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
fputs(text, stderr);
}
}

#ifdef GGML_USE_CUDA
static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
fputs(text, stderr);
static ggml_log_level lastlevel = GGML_LOG_LEVEL_NONE;
if (!llama_verbose()) {
auto efflevel = level == GGML_LOG_LEVEL_CONT ? lastlevel : level;
lastlevel = efflevel;
switch (efflevel) {
case GGML_LOG_LEVEL_CONT:
UNREACHABLE();
break;
case GGML_LOG_LEVEL_WARN:
if (warn) break;
[[fallthrough]];
case GGML_LOG_LEVEL_NONE: // not used?
case GGML_LOG_LEVEL_INFO:
case GGML_LOG_LEVEL_DEBUG:
return; // suppress
case GGML_LOG_LEVEL_ERROR:
;
}
}

fputs(text, stderr);
}
#endif

struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt

// sampling parameters
@@ -137,44 +146,6 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory
};

static llama_token llama_sample_top_p_top_k(
llama_context *ctx,
const llama_token *last_n_tokens_data,
int last_n_tokens_size,
int top_k,
float top_p,
float min_p,
float temp,
float repeat_penalty) {
auto logits = llama_get_logits_ith(ctx, -1);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
// Populate initial list of all candidates
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (int token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Sample repeat penalty
llama_sample_repetition_penalties(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty, 0.0f, 0.0f);

llama_token id;
if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
// temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_min_p(ctx, &candidates_p, min_p, 1);
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
return id;
}

const char *get_arch_name(gguf_context *ctx_gguf)
{
const int kid = gguf_find_key(ctx_gguf, "general.architecture");
@@ -241,21 +212,26 @@ static int32_t get_arch_key_u32(std::string const &modelPath, std::string const
}

struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
llama_context_params ctx_params;
int64_t n_threads = 0;
std::vector<LLModel::Token> end_tokens;
const char *backend_name = nullptr;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
int64_t n_threads = 0;
std::vector<LLModel::Token> end_tokens;
const char *backend_name = nullptr;

llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
llama_context_params ctx_params;
llama_sampler *sampler_chain;
};

LLamaModel::LLamaModel()
: d_ptr(new LLamaPrivate) {}
: d_ptr(std::make_unique<LLamaPrivate>())
{
auto sparams = llama_sampler_chain_default_params();
d_ptr->sampler_chain = llama_sampler_chain_init(sparams);
}

// default hparams (LLaMA 7B)
struct llama_file_hparams {
@@ -444,10 +420,9 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
}
}

d_ptr->ctx_params.n_ctx = n_ctx;
d_ptr->ctx_params.seed = params.seed;
d_ptr->ctx_params.type_k = params.kv_type;
d_ptr->ctx_params.type_v = params.kv_type;
d_ptr->ctx_params.n_ctx = n_ctx;
d_ptr->ctx_params.type_k = params.kv_type;
d_ptr->ctx_params.type_v = params.kv_type;

// The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
// that we want this many logits so the state serializes consistently.
@@ -513,6 +488,7 @@ LLamaModel::~LLamaModel()
llama_free(d_ptr->ctx);
}
llama_free_model(d_ptr->model);
llama_sampler_free(d_ptr->sampler_chain);
}

bool LLamaModel::isModelLoaded() const
@@ -522,18 +498,17 @@ bool LLamaModel::isModelLoaded() const

size_t LLamaModel::stateSize() const
{
return llama_get_state_size(d_ptr->ctx);
return llama_state_get_size(d_ptr->ctx);
}

size_t LLamaModel::saveState(uint8_t *dest) const
size_t LLamaModel::saveState(std::span<uint8_t> dest) const
{
return llama_copy_state_data(d_ptr->ctx, dest);
return llama_state_get_data(d_ptr->ctx, dest.data(), dest.size());
}

size_t LLamaModel::restoreState(const uint8_t *src)
size_t LLamaModel::restoreState(std::span<const uint8_t> src)
{
// const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
return llama_state_set_data(d_ptr->ctx, src.data(), src.size());
}

std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, std::string_view str, bool special)
@@ -573,13 +548,50 @@ std::string LLamaModel::tokenToString(Token id) const
return std::string(result.data(), result.size());
}

LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
void LLamaModel::initSampler(PromptContext &promptCtx)
{
auto *model = d_ptr->model;
auto *chain = d_ptr->sampler_chain;

// clear sampler chain
for (int i = llama_sampler_chain_n(chain) - 1; i >= 0; i--) {
auto *smpl = llama_sampler_chain_remove(chain, i);
llama_sampler_free(smpl);
}

// build new chain
llama_sampler_chain_add(chain,
llama_sampler_init_penalties(
llama_n_vocab(model),
llama_token_eos(model),
llama_token_nl(model),
promptCtx.repeat_last_n,
promptCtx.repeat_penalty,
// TODO(jared): consider making the below configurable
/*penalty_freq*/ 0.0f,
/*penalty_present*/ 0.0f,
/*penalize_nl*/ true,
/*ignore_eos*/ false
)
);
if (promptCtx.temp == 0.0f) {
llama_sampler_chain_add(chain, llama_sampler_init_greedy());
} else {
struct llama_sampler *samplers[] = {
llama_sampler_init_top_k(promptCtx.top_k),
llama_sampler_init_top_p(promptCtx.top_p, 1),
llama_sampler_init_min_p(promptCtx.min_p, 1),
llama_sampler_init_temp(promptCtx.temp),
llama_sampler_init_dist(LLAMA_DEFAULT_SEED)
};
for (auto *smpl : samplers)
llama_sampler_chain_add(chain, smpl);
}
}

LLModel::Token LLamaModel::sampleToken() const
{
const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
return llama_sample_top_p_top_k(d_ptr->ctx,
promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.min_p, promptCtx.temp,
promptCtx.repeat_penalty);
return llama_sampler_sample(d_ptr->sampler_chain, d_ptr->ctx, -1);
}

bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
@@ -1227,9 +1239,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)

DLL_EXPORT LLModel *construct()
{
llama_log_set(llama_log_callback, nullptr);
llama_log_set([](auto l, auto t, auto u) { llama_log_callback(l, t, u, false); }, nullptr);
#ifdef GGML_USE_CUDA
ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
ggml_backend_cuda_log_set_callback([](auto l, auto t, auto u) { llama_log_callback(l, t, u, true); }, nullptr);
#endif
return new LLamaModel;
}
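As a standalone illustration of the llama.cpp sampler-chain API that initSampler()/sampleToken() above now wrap: a minimal greedy-vs-temperature sketch (not part of this commit). The parameter values are placeholders and only calls already visible in the diff are used; the real code keeps the chain in LLamaPrivate and rebuilds it once per prompt rather than per call.

#include <llama.h>

// Minimal sketch of the sampler-chain lifecycle used above: build the chain
// once (initSampler), sample as needed (sampleToken), free it at shutdown.
// `ctx` is assumed to be a valid llama_context with freshly decoded logits.
static llama_token sample_once(llama_context *ctx, float temp)
{
    llama_sampler *chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    if (temp == 0.0f) {
        llama_sampler_chain_add(chain, llama_sampler_init_greedy());
    } else {
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));   // placeholder top_k
        llama_sampler_chain_add(chain, llama_sampler_init_temp(temp));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    }

    // -1 samples from the logits of the last token in the batch.
    llama_token tok = llama_sampler_sample(chain, ctx, -1);
    llama_sampler_free(chain);
    return tok;
}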
8 changes: 5 additions & 3 deletions gpt4all-backend/src/llamamodel_impl.h
@@ -7,6 +7,7 @@
#include "llmodel.h"

#include <memory>
#include <span>
#include <string>
#include <string_view>
#include <vector>
@@ -27,8 +28,8 @@ class LLamaModel : public LLModel {
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
size_t saveState(std::span<uint8_t> dest) const override;
size_t restoreState(std::span<const uint8_t> src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
@@ -56,7 +57,8 @@ class LLamaModel : public LLModel {
std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special) override;
bool isSpecialToken(Token id) const override;
std::string tokenToString(Token id) const override;
Token sampleToken(PromptContext &ctx) const override;
void initSampler(PromptContext &ctx) override;
Token sampleToken() const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
void shiftContext(PromptContext &promptCtx) override;
int32_t contextLength() const override;
8 changes: 4 additions & 4 deletions gpt4all-backend/src/llmodel_c.cpp
@@ -91,16 +91,16 @@ uint64_t llmodel_get_state_size(llmodel_model model)
return wrapper->llModel->stateSize();
}

uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size)
{
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->saveState(dest);
return wrapper->llModel->saveState({dest, size_t(size)});
}

uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size)
{
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->restoreState(src);
return wrapper->llModel->restoreState({src, size_t(size)});
}

void llmodel_prompt(llmodel_model model, const char *prompt,
4 changes: 3 additions & 1 deletion gpt4all-backend/src/llmodel_shared.cpp
@@ -244,14 +244,16 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
return;
}

initSampler(promptCtx);

std::string cachedResponse;
std::vector<Token> cachedTokens;
int n_predicted = 0;

// Predict next tokens
for (bool stop = false; !stop;) {
// Sample next token
std::optional<Token> new_tok = sampleToken(promptCtx);
std::optional<Token> new_tok = sampleToken();
std::string new_piece = tokenToString(new_tok.value());
cachedTokens.push_back(new_tok.value());
cachedResponse += new_piece;