Add mmap and mlock parameters for LLaMA and Falcon models
marella committed Aug 15, 2023
1 parent aa38ade commit bef7eef
Showing 8 changed files with 86 additions and 65 deletions.
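
The new mmap and mlock options follow the llama.cpp defaults: memory-mapping on, page-locking off. A minimal usage sketch, assuming Config stays a keyword-constructible dataclass and the LLM constructor keywords implied by the diff below (the model path is a placeholder):

from ctransformers.llm import LLM, Config

# mmap/mlock are the fields added by this commit; everything else keeps its default.
config = Config(mmap=True, mlock=False)
llm = LLM("/path/to/llama-model.bin", model_type="llama", config=config)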
26 changes: 22 additions & 4 deletions ctransformers/llm.py
@@ -13,6 +13,7 @@
     c_char_p,
     c_void_p,
     POINTER,
+    Structure,
 )
 from typing import (
     Any,
@@ -55,6 +56,25 @@ class Config:
     # model
     context_length: int = -1
     gpu_layers: int = 0
+    mmap: bool = True
+    mlock: bool = False
+
+    def to_struct(self):
+        return ConfigStruct(
+            context_length=self.context_length,
+            gpu_layers=self.gpu_layers,
+            mmap=self.mmap,
+            mlock=self.mlock,
+        )
+
+
+class ConfigStruct(Structure):
+    _fields_ = [
+        ("context_length", c_int),
+        ("gpu_layers", c_int),
+        ("mmap", c_bool),
+        ("mlock", c_bool),
+    ]
 
 
 docs = OrderedDict(
@@ -106,8 +126,7 @@ def load_library(path: Optional[str] = None, cuda: bool = False) -> Any:
     lib.ctransformers_llm_create.argtypes = [
         c_char_p,  # model_path
         c_char_p,  # model_type
-        c_int,  # context_length
-        c_int,  # gpu_layers
+        ConfigStruct,  # config
     ]
     lib.ctransformers_llm_create.restype = llm_p
 
@@ -209,8 +228,7 @@ def __init__(
         self._llm = self._lib.ctransformers_llm_create(
            model_path.encode(),
            model_type.encode(),
-           config.context_length,
-           config.gpu_layers,
+           config.to_struct(),
        )
        if self._llm is None:
            raise RuntimeError(
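The mechanically interesting part of the llm.py change: the native create function now takes one struct by value instead of two scalars, so future options can be added without touching the FFI signature again. A self-contained sketch of the pattern (standard library only, no shared library required; it only demonstrates how a ctypes Structure defines a C-compatible layout):

from ctypes import Structure, c_bool, c_int, sizeof

class ConfigStruct(Structure):
    # Field order and types must mirror the C++ struct Config exactly;
    # ctypes derives the same layout and padding the C compiler would.
    _fields_ = [
        ("context_length", c_int),
        ("gpu_layers", c_int),
        ("mmap", c_bool),
        ("mlock", c_bool),
    ]

cfg = ConfigStruct(context_length=2048, gpu_layers=0, mmap=True, mlock=False)
print(sizeof(cfg))  # 12 on common ABIs: two 4-byte ints, two 1-byte bools, 2 bytes padding

The flip side of passing by value is that the Python and C++ definitions must be kept in sync by hand: a field added on one side only, or declared in a different order, scrambles every argument after it without raising any error.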
4 changes: 2 additions & 2 deletions models/llm.cc
@@ -17,7 +17,7 @@ extern "C" {
 #endif
 
 LLM* ctransformers_llm_create(const char* model_path, const char* model_type,
-                              const int context_length, const int gpu_layers) {
+                              const Config config) {
   std::string type = model_type;
   // Remove non-alphanumeric characters from model type.
   type.erase(std::remove_if(type.begin(), type.end(),
@@ -49,7 +49,7 @@ LLM* ctransformers_llm_create(const char* model_path, const char* model_type,
     fprintf(stderr, "Model type '%s' is not supported.\n", model_type);
     return nullptr;
   }
-  if (!llm->Init(model_path, context_length, gpu_layers)) {
+  if (!llm->Init(model_path, config)) {
     delete llm;
     return nullptr;
   }
74 changes: 39 additions & 35 deletions models/llm.h
@@ -51,16 +51,22 @@ class RingBuffer {
   int pos_ = 0;
 };
 
+struct Config {
+  int context_length;
+  int gpu_layers;
+  bool mmap;
+  bool mlock;
+};
+
 class LLM {
  public:
   virtual ~LLM(){};
 
-  bool Init(const std::string &filename, const int context_length,
-            const int gpu_layers) {
+  bool Init(const std::string &filename, const Config &config) {
     if (initialized_) {
       return false;
     }
-    if (!Load(filename, context_length, gpu_layers)) {
+    if (!Load(filename, config)) {
       return false;
     }
     previous_tokens_.Init(ContextLength());
@@ -161,8 +167,7 @@ class LLM {
   std::vector<float> embeddings_;
   RingBuffer previous_tokens_;
 
-  virtual bool Load(const std::string &filename, const int context_length,
-                    const int gpu_layers) = 0;
+  virtual bool Load(const std::string &filename, const Config &config) = 0;
 
   virtual bool Eval(const std::vector<gpt_vocab::id> &tokens, const int threads,
                     const int n_past) = 0;
@@ -189,36 +194,35 @@ class LLM {
   }
 };
 
-#define REGISTER_LLM(_name)                                                  \
-  class _name##_llm : public LLM {                                           \
-   public:                                                                   \
-    virtual ~_name##_llm() {                                                 \
-      if (model_.ctx != nullptr) {                                           \
-        ggml_free(model_.ctx);                                               \
-      }                                                                      \
-    }                                                                        \
-                                                                             \
-   protected:                                                                \
-    bool Load(const std::string &filename, const int context_length,         \
-              const int gpu_layers) override {                               \
-      if (context_length > 0) {                                              \
-        model_.hparams.n_ctx = context_length;                               \
-      }                                                                      \
-      if (!_name##_model_load(filename, model_, vocab_)) {                   \
-        return false;                                                        \
-      }                                                                      \
-      n_ctx_ = model_.hparams.n_ctx;                                         \
-      return true;                                                           \
-    }                                                                        \
-                                                                             \
-    bool Eval(const std::vector<gpt_vocab::id> &tokens, const int threads,   \
-              const int n_past) override {                                   \
-      return _name##_eval(model_, threads, n_past, tokens, logits_,          \
-                          mem_per_token_);                                   \
-    }                                                                        \
-                                                                             \
-   private:                                                                  \
-    _name##_model model_;                                                    \
+#define REGISTER_LLM(_name)                                                 \
+  class _name##_llm : public LLM {                                          \
+   public:                                                                  \
+    virtual ~_name##_llm() {                                                \
+      if (model_.ctx != nullptr) {                                          \
+        ggml_free(model_.ctx);                                              \
+      }                                                                     \
+    }                                                                       \
+                                                                            \
+   protected:                                                               \
+    bool Load(const std::string &filename, const Config &config) override { \
+      if (config.context_length > 0) {                                      \
+        model_.hparams.n_ctx = config.context_length;                       \
+      }                                                                     \
+      if (!_name##_model_load(filename, model_, vocab_)) {                  \
+        return false;                                                       \
+      }                                                                     \
+      n_ctx_ = model_.hparams.n_ctx;                                        \
+      return true;                                                          \
+    }                                                                       \
+                                                                            \
+    bool Eval(const std::vector<gpt_vocab::id> &tokens, const int threads,  \
+              const int n_past) override {                                  \
+      return _name##_eval(model_, threads, n_past, tokens, logits_,         \
+                          mem_per_token_);                                  \
+    }                                                                       \
+                                                                            \
+   private:                                                                 \
+    _name##_model model_;                                                   \
   }
 
 #endif
11 changes: 6 additions & 5 deletions models/llms/falcon.cc
@@ -83,14 +83,15 @@ class falcon_llm : public LLM {
   }
 
  protected:
-  bool Load(const std::string &filename, const int context_length,
-            const int gpu_layers) override {
+  bool Load(const std::string &filename, const Config &config) override {
     falcon_context_params params = falcon_context_default_params();
     params.embedding = true;
-    if (context_length > 0) {
-      params.n_ctx = context_length;
+    if (config.context_length > 0) {
+      params.n_ctx = config.context_length;
     }
-    params.n_gpu_layers = gpu_layers;
+    params.n_gpu_layers = config.gpu_layers;
+    params.use_mmap = config.mmap;
+    params.use_mlock = config.mlock;
 
     ctx_ = falcon_init_from_file(filename.c_str(), params);
     if (ctx_ == nullptr) {
11 changes: 6 additions & 5 deletions models/llms/llama.cc
@@ -82,14 +82,15 @@ class llama_llm : public LLM {
   }
 
  protected:
-  bool Load(const std::string &filename, const int context_length,
-            const int gpu_layers) override {
+  bool Load(const std::string &filename, const Config &config) override {
     llama_context_params params = llama_context_default_params();
     params.embedding = true;
-    if (context_length > 0) {
-      params.n_ctx = context_length;
+    if (config.context_length > 0) {
+      params.n_ctx = config.context_length;
     }
-    params.n_gpu_layers = gpu_layers;
+    params.n_gpu_layers = config.gpu_layers;
+    params.use_mmap = config.mmap;
+    params.use_mlock = config.mlock;
     std::regex pattern_70b(R"((\b|_)70b(\b|_))", std::regex_constants::icase);
     if (std::regex_search(filename, pattern_70b)) {
       params.n_gqa = 8;
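For intuition about the two new flags as used by the llama and falcon backends above: use_mmap maps the weights file so the OS pages it in lazily and can evict or share those pages, while use_mlock pins the mapped pages so they are never swapped out. A rough, self-contained Python illustration of the mmap half (the zero-filled temp file stands in for a real weights file; page-locking is platform-specific and omitted):

import mmap
import os
import tempfile

# Toy stand-in for a model weights file.
with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(b"\x00" * 4096)
    path = f.name

fd = os.open(path, os.O_RDONLY)
with mmap.mmap(fd, 0, access=mmap.ACCESS_READ) as weights:
    _ = weights[0]  # touching a byte faults just that page into memory
os.close(fd)
os.unlink(path)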
9 changes: 4 additions & 5 deletions models/llms/mpt.cc
@@ -604,12 +604,11 @@ class mpt_llm : public LLM {
   }
 
  protected:
-  bool Load(const std::string &filename, const int context_length,
-            const int gpu_layers) override {
-    if (context_length > 0) {
-      model_.hparams.n_ctx = context_length;
+  bool Load(const std::string &filename, const Config &config) override {
+    if (config.context_length > 0) {
+      model_.hparams.n_ctx = config.context_length;
     }
-    if (!mpt_model_load(filename, model_, vocab_, gpu_layers)) {
+    if (!mpt_model_load(filename, model_, vocab_, config.gpu_layers)) {
       return false;
     }
     n_ctx_ = model_.hparams.n_ctx;
7 changes: 3 additions & 4 deletions models/llms/replit.cc
@@ -647,10 +647,9 @@ class replit_llm : public LLM {
 
  protected:
   replit_tokenizer replit_tokenizer_;
-  bool Load(const std::string &filename, const int context_length,
-            const int gpu_layers) override {
-    if (context_length > 0) {
-      model_.hparams.n_ctx = context_length;
+  bool Load(const std::string &filename, const Config &config) override {
+    if (config.context_length > 0) {
+      model_.hparams.n_ctx = config.context_length;
     }
     if (!replit_model_load(filename, model_, replit_tokenizer_)) {
       return false;
9 changes: 4 additions & 5 deletions models/llms/starcoder.cc
@@ -775,12 +775,11 @@ class starcoder_llm : public LLM {
   }
 
  protected:
-  bool Load(const std::string &filename, const int context_length,
-            const int gpu_layers) override {
-    if (context_length > 0) {
-      model_.hparams.n_ctx = context_length;
+  bool Load(const std::string &filename, const Config &config) override {
+    if (config.context_length > 0) {
+      model_.hparams.n_ctx = config.context_length;
     }
-    if (!starcoder_model_load(filename, model_, vocab_, gpu_layers)) {
+    if (!starcoder_model_load(filename, model_, vocab_, config.gpu_layers)) {
       return false;
     }
     n_ctx_ = model_.hparams.n_ctx;
