Add mmap and mlock parameters for LLaMA and Falcon models
marella committed Aug 15, 2023
1 parent aa38ade commit bef7eef
Showing 8 changed files with 86 additions and 65 deletions.
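
The new mmap and mlock options follow the llama.cpp defaults: memory-mapping on, page-locking off. A minimal usage sketch, assuming Config stays a keyword-constructible dataclass and the LLM constructor keywords implied by the diff below (the model path is a placeholder):

from ctransformers.llm import LLM, Config

# mmap/mlock are the fields added by this commit; everything else keeps its default.
config = Config(mmap=True, mlock=False)
llm = LLM("/path/to/llama-model.bin", model_type="llama", config=config)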
26 changes: 22 additions & 4 deletions ctransformers/llm.py
@@ -13,6 +13,7 @@
     c_char_p,
     c_void_p,
     POINTER,
+    Structure,
 )
 from typing import (
     Any,
@@ -55,6 +56,25 @@ class Config:
     # model
     context_length: int = -1
     gpu_layers: int = 0
+    mmap: bool = True
+    mlock: bool = False
+
+    def to_struct(self):
+        return ConfigStruct(
+            context_length=self.context_length,
+            gpu_layers=self.gpu_layers,
+            mmap=self.mmap,
+            mlock=self.mlock,
+        )
+
+
+class ConfigStruct(Structure):
+    _fields_ = [
+        ("context_length", c_int),
+        ("gpu_layers", c_int),
+        ("mmap", c_bool),
+        ("mlock", c_bool),
+    ]
 
 
 docs = OrderedDict(
@@ -106,8 +126,7 @@ def load_library(path: Optional[str] = None, cuda: bool = False) -> Any:
     lib.ctransformers_llm_create.argtypes = [
         c_char_p,  # model_path
         c_char_p,  # model_type
-        c_int,  # context_length
-        c_int,  # gpu_layers
+        ConfigStruct,  # config
     ]
     lib.ctransformers_llm_create.restype = llm_p
 
@@ -209,8 +228,7 @@ def __init__(
         self._llm = self._lib.ctransformers_llm_create(
            model_path.encode(),
            model_type.encode(),
-           config.context_length,
-           config.gpu_layers,
+           config.to_struct(),
        )
        if self._llm is None:
            raise RuntimeError(
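The mechanically interesting part of the llm.py change: the native create function now takes one struct by value instead of two scalars, so future options can be added without touching the FFI signature again. A self-contained sketch of the pattern (standard library only, no shared library required; it only demonstrates how a ctypes Structure defines a C-compatible layout):

from ctypes import Structure, c_bool, c_int, sizeof

class ConfigStruct(Structure):
    # Field order and types must mirror the C++ struct Config exactly;
    # ctypes derives the same layout and padding the C compiler would.
    _fields_ = [
        ("context_length", c_int),
        ("gpu_layers", c_int),
        ("mmap", c_bool),
        ("mlock", c_bool),
    ]

cfg = ConfigStruct(context_length=2048, gpu_layers=0, mmap=True, mlock=False)
print(sizeof(cfg))  # 12 on common ABIs: two 4-byte ints, two 1-byte bools, 2 bytes padding

The flip side of passing by value is that the Python and C++ definitions must be kept in sync by hand: a field added on one side only, or declared in a different order, scrambles every argument after it without raising any error.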
4 changes: 2 additions & 2 deletions models/llm.cc
@@ -17,7 +17,7 @@ extern "C" {
 #endif
 
 LLM* ctransformers_llm_create(const char* model_path, const char* model_type,
-                              const int context_length, const int gpu_layers) {
+                              const Config config) {
   std::string type = model_type;
   // Remove non-alphanumeric characters from model type.
   type.erase(std::remove_if(type.begin(), type.end(),
@@ -49,7 +49,7 @@ LLM* ctransformers_llm_create(const char* model_path, const char* model_type,
     fprintf(stderr, "Model type '%s' is not supported.\n", model_type);
     return nullptr;
   }
-  if (!llm->Init(model_path, context_length, gpu_layers)) {
+  if (!llm->Init(model_path, config)) {
     delete llm;
     return nullptr;
   }
74 changes: 39 additions & 35 deletions models/llm.h
@@ -51,16 +51,22 @@ class RingBuffer {
   int pos_ = 0;
 };
 
+struct Config {
+  int context_length;
+  int gpu_layers;
+  bool mmap;
+  bool mlock;
+};
+
 class LLM {
  public:
   virtual ~LLM(){};
 
-  bool Init(const std::string &filename, const int context_length,
-            const int gpu_layers) {
+  bool Init(const std::string &filename, const Config &config) {
     if (initialized_) {
       return false;
     }
-    if (!Load(filename, context_length, gpu_layers)) {
+    if (!Load(filename, config)) {
       return false;
     }
     previous_tokens_.Init(ContextLength());
@@ -161,8 +167,7 @@ class LLM {
   std::vector<float> embeddings_;
   RingBuffer previous_tokens_;
 
-  virtual bool Load(const std::string &filename, const int context_length,
-                    const int gpu_layers) = 0;
+  virtual bool Load(const std::string &filename, const Config &config) = 0;
 
   virtual bool Eval(const std::vector<gpt_vocab::id> &tokens, const int threads,
                     const int n_past) = 0;
@@ -189,36 +194,35 @@ class LLM {
   }
 };
 
-#define REGISTER_LLM(_name)                                                  \
-  class _name##_llm : public LLM {                                           \
-   public:                                                                   \
-    virtual ~_name##_llm() {                                                 \
-      if (model_.ctx != nullptr) {                                           \
-        ggml_free(model_.ctx);                                               \
-      }                                                                      \
-    }                                                                        \
-                                                                             \
-   protected:                                                                \
-    bool Load(const std::string &filename, const int context_length,         \
-              const int gpu_layers) override {                               \
-      if (context_length > 0) {                                              \
-        model_.hparams.n_ctx = context_length;                               \
-      }                                                                      \
-      if (!_name##_model_load(filename, model_, vocab_)) {                   \
-        return false;                                                        \
-      }                                                                      \
-      n_ctx_ = model_.hparams.n_ctx;                                         \
-      return true;                                                           \
-    }                                                                        \
-                                                                             \
-    bool Eval(const std::vector<gpt_vocab::id> &tokens, const int threads,   \
-              const int n_past) override {                                   \
-      return _name##_eval(model_, threads, n_past, tokens, logits_,          \
-                          mem_per_token_);                                   \
-    }                                                                        \
-                                                                             \
-   private:                                                                  \
-    _name##_model model_;                                                    \
+#define REGISTER_LLM(_name)                                                 \
+  class _name##_llm : public LLM {                                          \
+   public:                                                                  \
+    virtual ~_name##_llm() {                                                \
+      if (model_.ctx != nullptr) {                                          \
+        ggml_free(model_.ctx);                                              \
+      }                                                                     \
+    }                                                                       \
+                                                                            \
+   protected:                                                               \
+    bool Load(const std::string &filename, const Config &config) override { \
+      if (config.context_length > 0) {                                      \
+        model_.hparams.n_ctx = config.context_length;                       \
+      }                                                                     \
+      if (!_name##_model_load(filename, model_, vocab_)) {                  \
+        return false;                                                       \
+      }                                                                     \
+      n_ctx_ = model_.hparams.n_ctx;                                        \
+      return true;                                                          \
+    }                                                                       \
+                                                                            \
+    bool Eval(const std::vector<gpt_vocab::id> &tokens, const int threads,  \
+              const int n_past) override {                                  \
+      return _name##_eval(model_, threads, n_past, tokens, logits_,         \
+                          mem_per_token_);                                  \
+    }                                                                       \
+                                                                            \
+   private:                                                                 \
+    _name##_model model_;                                                   \
   }
 
 #endif
11 changes: 6 additions & 5 deletions models/llms/falcon.cc
@@ -83,14 +83,15 @@ class falcon_llm : public LLM {
   }
 
  protected:
-  bool Load(const std::string &filename, const int context_length,
-            const int gpu_layers) override {
+  bool Load(const std::string &filename, const Config &config) override {
     falcon_context_params params = falcon_context_default_params();
     params.embedding = true;
-    if (context_length > 0) {
-      params.n_ctx = context_length;
+    if (config.context_length > 0) {
+      params.n_ctx = config.context_length;
     }
-    params.n_gpu_layers = gpu_layers;
+    params.n_gpu_layers = config.gpu_layers;
+    params.use_mmap = config.mmap;
+    params.use_mlock = config.mlock;
 
     ctx_ = falcon_init_from_file(filename.c_str(), params);
     if (ctx_ == nullptr) {
11 changes: 6 additions & 5 deletions models/llms/llama.cc
@@ -82,14 +82,15 @@ class llama_llm : public LLM {
   }
 
  protected:
-  bool Load(const std::string &filename, const int context_length,
-            const int gpu_layers) override {
+  bool Load(const std::string &filename, const Config &config) override {
     llama_context_params params = llama_context_default_params();
     params.embedding = true;
-    if (context_length > 0) {
-      params.n_ctx = context_length;
+    if (config.context_length > 0) {
+      params.n_ctx = config.context_length;
     }
-    params.n_gpu_layers = gpu_layers;
+    params.n_gpu_layers = config.gpu_layers;
+    params.use_mmap = config.mmap;
+    params.use_mlock = config.mlock;
     std::regex pattern_70b(R"((\b|_)70b(\b|_))", std::regex_constants::icase);
     if (std::regex_search(filename, pattern_70b)) {
       params.n_gqa = 8;
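For intuition about the two new flags as used by the llama and falcon backends above: use_mmap maps the weights file so the OS pages it in lazily and can evict or share those pages, while use_mlock pins the mapped pages so they are never swapped out. A rough, self-contained Python illustration of the mmap half (the zero-filled temp file stands in for a real weights file; page-locking is platform-specific and omitted):

import mmap
import os
import tempfile

# Toy stand-in for a model weights file.
with tempfile.NamedTemporaryFile(delete=False) as f:
    f.write(b"\x00" * 4096)
    path = f.name

fd = os.open(path, os.O_RDONLY)
with mmap.mmap(fd, 0, access=mmap.ACCESS_READ) as weights:
    _ = weights[0]  # touching a byte faults just that page into memory
os.close(fd)
os.unlink(path)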
9 changes: 4 additions & 5 deletions models/llms/mpt.cc
@@ -604,12 +604,11 @@ class mpt_llm : public LLM {
   }
 
  protected:
-  bool Load(const std::string &filename, const int context_length,
-            const int gpu_layers) override {
-    if (context_length > 0) {
-      model_.hparams.n_ctx = context_length;
+  bool Load(const std::string &filename, const Config &config) override {
+    if (config.context_length > 0) {
+      model_.hparams.n_ctx = config.context_length;
     }
-    if (!mpt_model_load(filename, model_, vocab_, gpu_layers)) {
+    if (!mpt_model_load(filename, model_, vocab_, config.gpu_layers)) {
       return false;
     }
     n_ctx_ = model_.hparams.n_ctx;
7 changes: 3 additions & 4 deletions models/llms/replit.cc
@@ -647,10 +647,9 @@ class replit_llm : public LLM {
 
  protected:
   replit_tokenizer replit_tokenizer_;
-  bool Load(const std::string &filename, const int context_length,
-            const int gpu_layers) override {
-    if (context_length > 0) {
-      model_.hparams.n_ctx = context_length;
+  bool Load(const std::string &filename, const Config &config) override {
+    if (config.context_length > 0) {
+      model_.hparams.n_ctx = config.context_length;
     }
     if (!replit_model_load(filename, model_, replit_tokenizer_)) {
       return false;
9 changes: 4 additions & 5 deletions models/llms/starcoder.cc
@@ -775,12 +775,11 @@ class starcoder_llm : public LLM {
   }
 
  protected:
-  bool Load(const std::string &filename, const int context_length,
-            const int gpu_layers) override {
-    if (context_length > 0) {
-      model_.hparams.n_ctx = context_length;
+  bool Load(const std::string &filename, const Config &config) override {
+    if (config.context_length > 0) {
+      model_.hparams.n_ctx = config.context_length;
     }
-    if (!starcoder_model_load(filename, model_, vocab_, gpu_layers)) {
+    if (!starcoder_model_load(filename, model_, vocab_, config.gpu_layers)) {
       return false;
     }
     n_ctx_ = model_.hparams.n_ctx;
