backend: rebase llama.cpp on upstream as of Sep 26th (#2998)
Signed-off-by: Jared Van Bortel <[email protected]>
cebtenzzre authored Sep 27, 2024
1 parent 8bd937e commit f9d6be8
Showing 16 changed files with 166 additions and 601 deletions.
2 changes: 1 addition & 1 deletion gpt4all-backend/deps/llama.cpp-mainline
10 changes: 6 additions & 4 deletions gpt4all-backend/include/gpt4all-backend/llmodel.h
@@ -7,6 +7,7 @@
#include <cstdint>
#include <functional>
#include <optional>
#include <span>
#include <stdexcept>
#include <string>
#include <string_view>
@@ -149,9 +150,9 @@ class LLModel {
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
virtual bool isModelLoaded() const = 0;
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
virtual size_t stateSize() const { return 0; }
virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
virtual size_t stateSize() const = 0;
virtual size_t saveState(std::span<uint8_t> dest) const = 0;
virtual size_t restoreState(std::span<const uint8_t> src) = 0;

// This method requires the model to return true from supportsCompletion otherwise it will throw
// an error
@@ -215,7 +216,8 @@ class LLModel {
virtual std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special = false) = 0;
virtual bool isSpecialToken(Token id) const = 0;
virtual std::string tokenToString(Token id) const = 0;
virtual Token sampleToken(PromptContext &ctx) const = 0;
virtual void initSampler(PromptContext &ctx) = 0;
virtual Token sampleToken() const = 0;
virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
virtual void shiftContext(PromptContext &promptCtx) = 0;
virtual int32_t contextLength() const = 0;
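For readers following the interface change above: a minimal caller-side sketch (not part of this commit) showing how the now pure-virtual, span-based state methods might be used to capture and resume model state. Only stateSize(), saveState(), and restoreState() are taken from the header; the helper names and error handling are illustrative assumptions.

#include <cstdint>
#include <cstdio>
#include <vector>

#include "llmodel.h"

// Hypothetical helper: capture a loaded model's state into a byte buffer.
static std::vector<uint8_t> captureState(const LLModel &model)
{
    std::vector<uint8_t> buf(model.stateSize());
    size_t written = model.saveState(buf);   // vector converts to std::span<uint8_t>
    if (written == 0)
        std::fprintf(stderr, "saveState failed\n");
    buf.resize(written);
    return buf;
}

// Hypothetical helper: restore a previously captured state into the same model.
static bool resumeState(LLModel &model, const std::vector<uint8_t> &buf)
{
    // vector<uint8_t> converts to std::span<const uint8_t>
    return model.restoreState(buf) == buf.size();
}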
12 changes: 7 additions & 5 deletions gpt4all-backend/include/gpt4all-backend/llmodel_c.h
@@ -148,18 +148,20 @@ uint64_t llmodel_get_state_size(llmodel_model model);
* NOTE: This state data is specific to the type of model you have created.
* @param model A pointer to the llmodel_model instance.
* @param dest A pointer to the destination.
* @return the number of bytes copied
* @param size The size of the destination buffer.
* @return the number of bytes copied, or zero on error.
*/
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size);

/**
* Restores the internal state of the model using data from the specified address.
* NOTE: This state data is specific to the type of model you have created.
* @param model A pointer to the llmodel_model instance.
* @param src A pointer to the src.
* @return the number of bytes read
* @param src A pointer to the state data.
* @param size The size of the source data.
* @return The number of bytes read, or zero on error.
*/
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size);

/**
* Generate a response using the model.
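A usage sketch of the updated C API documented above (not part of this commit): callers now pass the buffer size explicitly and treat a zero return as an error. Model creation and loading are omitted; only the three state functions shown in this header are assumed.

#include <cstdint>
#include <vector>

#include "llmodel_c.h"

// Sketch: save and restore model state through the C ABI, assuming `model`
// is an already-created and loaded llmodel_model handle.
static bool roundtrip_state(llmodel_model model)
{
    uint64_t size = llmodel_get_state_size(model);
    std::vector<uint8_t> buf(size);

    // Both calls now take an explicit size and return 0 on error.
    if (llmodel_save_state_data(model, buf.data(), buf.size()) == 0)
        return false;
    return llmodel_restore_state_data(model, buf.data(), buf.size()) != 0;
}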
7 changes: 5 additions & 2 deletions gpt4all-backend/llama.cpp.cmake
@@ -978,10 +978,13 @@ function(include_ggml SUFFIX)

add_library(llama${SUFFIX} STATIC
${DIRECTORY}/include/llama.h
${DIRECTORY}/src/llama-grammar.cpp
${DIRECTORY}/src/llama-sampling.cpp
${DIRECTORY}/src/llama-vocab.cpp
${DIRECTORY}/src/llama.cpp
${DIRECTORY}/src/unicode.h
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode-data.cpp
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode.h
)

target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
174 changes: 93 additions & 81 deletions gpt4all-backend/src/llamamodel.cpp
Expand Up @@ -2,6 +2,7 @@
#include "llamamodel_impl.h"

#include "llmodel.h"
#include "utils.h"

#include <ggml.h>
#include <llama.h>
@@ -103,26 +104,34 @@ static bool llama_verbose()
return var && *var;
}

static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
static void llama_log_callback(ggml_log_level level, const char *text, void *userdata, bool warn)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
fputs(text, stderr);
}
}

#ifdef GGML_USE_CUDA
static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
fputs(text, stderr);
static ggml_log_level lastlevel = GGML_LOG_LEVEL_NONE;
if (!llama_verbose()) {
auto efflevel = level == GGML_LOG_LEVEL_CONT ? lastlevel : level;
lastlevel = efflevel;
switch (efflevel) {
case GGML_LOG_LEVEL_CONT:
UNREACHABLE();
break;
case GGML_LOG_LEVEL_WARN:
if (warn) break;
[[fallthrough]];
case GGML_LOG_LEVEL_NONE: // not used?
case GGML_LOG_LEVEL_INFO:
case GGML_LOG_LEVEL_DEBUG:
return; // suppress
case GGML_LOG_LEVEL_ERROR:
;
}
}

fputs(text, stderr);
}
#endif

struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt

// sampling parameters
@@ -137,44 +146,6 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory
};

static llama_token llama_sample_top_p_top_k(
llama_context *ctx,
const llama_token *last_n_tokens_data,
int last_n_tokens_size,
int top_k,
float top_p,
float min_p,
float temp,
float repeat_penalty) {
auto logits = llama_get_logits_ith(ctx, -1);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
// Populate initial list of all candidates
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (int token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Sample repeat penalty
llama_sample_repetition_penalties(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty, 0.0f, 0.0f);

llama_token id;
if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
// temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_min_p(ctx, &candidates_p, min_p, 1);
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
return id;
}

const char *get_arch_name(gguf_context *ctx_gguf)
{
const int kid = gguf_find_key(ctx_gguf, "general.architecture");
@@ -241,21 +212,26 @@ static int32_t get_arch_key_u32(std::string const &modelPath, std::string const
}

struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
llama_context_params ctx_params;
int64_t n_threads = 0;
std::vector<LLModel::Token> end_tokens;
const char *backend_name = nullptr;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
int64_t n_threads = 0;
std::vector<LLModel::Token> end_tokens;
const char *backend_name = nullptr;

llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
llama_context_params ctx_params;
llama_sampler *sampler_chain;
};

LLamaModel::LLamaModel()
: d_ptr(new LLamaPrivate) {}
: d_ptr(std::make_unique<LLamaPrivate>())
{
auto sparams = llama_sampler_chain_default_params();
d_ptr->sampler_chain = llama_sampler_chain_init(sparams);
}

// default hparams (LLaMA 7B)
struct llama_file_hparams {
@@ -444,10 +420,9 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
}
}

d_ptr->ctx_params.n_ctx = n_ctx;
d_ptr->ctx_params.seed = params.seed;
d_ptr->ctx_params.type_k = params.kv_type;
d_ptr->ctx_params.type_v = params.kv_type;
d_ptr->ctx_params.n_ctx = n_ctx;
d_ptr->ctx_params.type_k = params.kv_type;
d_ptr->ctx_params.type_v = params.kv_type;

// The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
// that we want this many logits so the state serializes consistently.
@@ -513,6 +488,7 @@ LLamaModel::~LLamaModel()
llama_free(d_ptr->ctx);
}
llama_free_model(d_ptr->model);
llama_sampler_free(d_ptr->sampler_chain);
}

bool LLamaModel::isModelLoaded() const
@@ -522,18 +498,17 @@ bool LLamaModel::isModelLoaded() const

size_t LLamaModel::stateSize() const
{
return llama_get_state_size(d_ptr->ctx);
return llama_state_get_size(d_ptr->ctx);
}

size_t LLamaModel::saveState(uint8_t *dest) const
size_t LLamaModel::saveState(std::span<uint8_t> dest) const
{
return llama_copy_state_data(d_ptr->ctx, dest);
return llama_state_get_data(d_ptr->ctx, dest.data(), dest.size());
}

size_t LLamaModel::restoreState(const uint8_t *src)
size_t LLamaModel::restoreState(std::span<const uint8_t> src)
{
// const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
return llama_state_set_data(d_ptr->ctx, src.data(), src.size());
}

std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, std::string_view str, bool special)
@@ -573,13 +548,50 @@ std::string LLamaModel::tokenToString(Token id) const
return std::string(result.data(), result.size());
}

LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
void LLamaModel::initSampler(PromptContext &promptCtx)
{
auto *model = d_ptr->model;
auto *chain = d_ptr->sampler_chain;

// clear sampler chain
for (int i = llama_sampler_chain_n(chain) - 1; i >= 0; i--) {
auto *smpl = llama_sampler_chain_remove(chain, i);
llama_sampler_free(smpl);
}

// build new chain
llama_sampler_chain_add(chain,
llama_sampler_init_penalties(
llama_n_vocab(model),
llama_token_eos(model),
llama_token_nl(model),
promptCtx.repeat_last_n,
promptCtx.repeat_penalty,
// TODO(jared): consider making the below configurable
/*penalty_freq*/ 0.0f,
/*penalty_present*/ 0.0f,
/*penalize_nl*/ true,
/*ignore_eos*/ false
)
);
if (promptCtx.temp == 0.0f) {
llama_sampler_chain_add(chain, llama_sampler_init_greedy());
} else {
struct llama_sampler *samplers[] = {
llama_sampler_init_top_k(promptCtx.top_k),
llama_sampler_init_top_p(promptCtx.top_p, 1),
llama_sampler_init_min_p(promptCtx.min_p, 1),
llama_sampler_init_temp(promptCtx.temp),
llama_sampler_init_dist(LLAMA_DEFAULT_SEED)
};
for (auto *smpl : samplers)
llama_sampler_chain_add(chain, smpl);
}
}

LLModel::Token LLamaModel::sampleToken() const
{
const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
return llama_sample_top_p_top_k(d_ptr->ctx,
promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.min_p, promptCtx.temp,
promptCtx.repeat_penalty);
return llama_sampler_sample(d_ptr->sampler_chain, d_ptr->ctx, -1);
}

bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
@@ -1227,9 +1239,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)

DLL_EXPORT LLModel *construct()
{
llama_log_set(llama_log_callback, nullptr);
llama_log_set([](auto l, auto t, auto u) { llama_log_callback(l, t, u, false); }, nullptr);
#ifdef GGML_USE_CUDA
ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
ggml_backend_cuda_log_set_callback([](auto l, auto t, auto u) { llama_log_callback(l, t, u, true); }, nullptr);
#endif
return new LLamaModel;
}
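As a standalone illustration of the llama.cpp sampler-chain API that initSampler()/sampleToken() above now wrap: a minimal greedy-vs-temperature sketch (not part of this commit). The parameter values are placeholders and only calls already visible in the diff are used; the real code keeps the chain in LLamaPrivate and rebuilds it once per prompt rather than per call.

#include <llama.h>

// Minimal sketch of the sampler-chain lifecycle used above: build the chain
// once (initSampler), sample as needed (sampleToken), free it at shutdown.
// `ctx` is assumed to be a valid llama_context with freshly decoded logits.
static llama_token sample_once(llama_context *ctx, float temp)
{
    llama_sampler *chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    if (temp == 0.0f) {
        llama_sampler_chain_add(chain, llama_sampler_init_greedy());
    } else {
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));   // placeholder top_k
        llama_sampler_chain_add(chain, llama_sampler_init_temp(temp));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    }

    // -1 samples from the logits of the last token in the batch.
    llama_token tok = llama_sampler_sample(chain, ctx, -1);
    llama_sampler_free(chain);
    return tok;
}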
8 changes: 5 additions & 3 deletions gpt4all-backend/src/llamamodel_impl.h
@@ -7,6 +7,7 @@
#include "llmodel.h"

#include <memory>
#include <span>
#include <string>
#include <string_view>
#include <vector>
@@ -27,8 +28,8 @@ class LLamaModel : public LLModel {
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
size_t saveState(std::span<uint8_t> dest) const override;
size_t restoreState(std::span<const uint8_t> src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
@@ -56,7 +57,8 @@ class LLamaModel : public LLModel {
std::vector<Token> tokenize(PromptContext &ctx, std::string_view str, bool special) override;
bool isSpecialToken(Token id) const override;
std::string tokenToString(Token id) const override;
Token sampleToken(PromptContext &ctx) const override;
void initSampler(PromptContext &ctx) override;
Token sampleToken() const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
void shiftContext(PromptContext &promptCtx) override;
int32_t contextLength() const override;
8 changes: 4 additions & 4 deletions gpt4all-backend/src/llmodel_c.cpp
@@ -91,16 +91,16 @@ uint64_t llmodel_get_state_size(llmodel_model model)
return wrapper->llModel->stateSize();
}

uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest, uint64_t size)
{
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->saveState(dest);
return wrapper->llModel->saveState({dest, size_t(size)});
}

uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src, uint64_t size)
{
auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->restoreState(src);
return wrapper->llModel->restoreState({src, size_t(size)});
}

void llmodel_prompt(llmodel_model model, const char *prompt,
4 changes: 3 additions & 1 deletion gpt4all-backend/src/llmodel_shared.cpp
@@ -244,14 +244,16 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
return;
}

initSampler(promptCtx);

std::string cachedResponse;
std::vector<Token> cachedTokens;
int n_predicted = 0;

// Predict next tokens
for (bool stop = false; !stop;) {
// Sample next token
std::optional<Token> new_tok = sampleToken(promptCtx);
std::optional<Token> new_tok = sampleToken();
std::string new_piece = tokenToString(new_tok.value());
cachedTokens.push_back(new_tok.value());
cachedResponse += new_piece;