forked from LostRuins/koboldcpp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
expose.cpp
289 lines (258 loc) · 10.5 KB
/
expose.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
//This is Concedo's shitty adapter for adding python bindings for llama
//Considerations:
//Don't want to use pybind11 due to dependencies on MSVC
//Make ZERO or as MINIMAL changes as possible to main.cpp - do not move their function declarations here!
//Leave main.cpp UNTOUCHED, We want to be able to update the repo and pull any changes automatically.
//No dynamic memory allocation! Setup structs with FIXED (known) shapes and sizes for ALL output fields
//Python will ALWAYS provide the memory, we just write to it.
#include <cassert>
#include <cstring>
#include <fstream>
#include <regex>
#include <iostream>
#include <iterator>
#include <queue>
#include <string>
#include <math.h>
#include <cstdint>
#include "expose.h"
#include "model_adapter.cpp"
extern "C"
{
//Backing storage for the "NAME=value" strings that load_model hands to putenv().
//NOTE(review): presumably kept at file scope so the buffers stay alive after load_model returns, since putenv() may retain the pointer it is given - confirm.
std::string platformenv, deviceenv, vulkandeviceenv;
//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
//NOTE(review): the line above looks like a legacy description of format codes - verify against the FileFormat enum.
//Format of the model file; assigned in load_model from check_file_format (or from the forced version) and starts as BADFORMAT.
static FileFormat file_format = FileFormat::BADFORMAT;
//Extra format metadata; passed by address to check_file_format and then forwarded to gpttype_load_model.
static FileFormatExtraMeta file_format_meta;
bool load_model(const load_model_inputs inputs)
{
std::string model = inputs.model_filename;
lora_filename = inputs.lora_filename;
lora_base = inputs.lora_base;
mmproj_filename = inputs.mmproj_filename;
int forceversion = inputs.forceversion;
file_format = check_file_format(model.c_str(),&file_format_meta);
if(forceversion!=0)
{
printf("\nWARNING: FILE FORMAT FORCED TO VER %d\nIf incorrect, loading may fail or crash.\n",forceversion);
file_format = (FileFormat)forceversion;
}
//first digit is whether configured, second is platform, third is devices
int cl_parseinfo = inputs.clblast_info;
std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(cl_parseinfo>0?1:0);
putenv((char*)usingclblast.c_str());
cl_parseinfo = cl_parseinfo%100; //keep last 2 digits
int platform = cl_parseinfo/10;
int devices = cl_parseinfo%10;
platformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform);
deviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices);
putenv((char*)platformenv.c_str());
putenv((char*)deviceenv.c_str());
std::string vulkan_info_raw = inputs.vulkan_info;
std::string vulkan_info_str = "";
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
vulkan_info_str += vulkan_info_raw[i];
if (i < vulkan_info_raw.length() - 1) {
vulkan_info_str += ",";
}
}
if(vulkan_info_str!="")
{
vulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
putenv((char*)vulkandeviceenv.c_str());
}
executable_path = inputs.executable_path;
if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4 || file_format==FileFormat::GPTJ_5)
{
printf("\n---\nIdentified as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta);
if (lr == ModelLoadResult::RETRY_LOAD)
{
if(file_format==FileFormat::GPTJ_1)
{
//if we tried 1 first, then try 3 and lastly 2
//otherwise if we tried 3 first, then try 2
file_format = FileFormat::GPTJ_4;
printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
lr = gpttype_load_model(inputs, file_format, file_format_meta);
}
if (lr == ModelLoadResult::RETRY_LOAD)
{
file_format = FileFormat::GPTJ_3;
printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
lr = gpttype_load_model(inputs, file_format, file_format_meta);
}
//lastly try format 2
if (lr == ModelLoadResult::RETRY_LOAD)
{
file_format = FileFormat::GPTJ_2;
printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
lr = gpttype_load_model(inputs, file_format, file_format_meta);
}
}
if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
{
return false;
}
else
{
return true;
}
}
else if(file_format==FileFormat::GPT2_1||file_format==FileFormat::GPT2_2||file_format==FileFormat::GPT2_3||file_format==FileFormat::GPT2_4)
{
printf("\n---\nIdentified as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta);
if (lr == ModelLoadResult::RETRY_LOAD)
{
file_format = FileFormat::GPT2_3;
printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
lr = gpttype_load_model(inputs, file_format, file_format_meta);
}
if (lr == ModelLoadResult::RETRY_LOAD)
{
file_format = FileFormat::GPT2_2;
printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
lr = gpttype_load_model(inputs, file_format, file_format_meta);
}
if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
{
return false;
}
else
{
return true;
}
}
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5 || file_format==FileFormat::NEOX_6 || file_format==FileFormat::NEOX_7)
{
printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta);
if (lr == ModelLoadResult::RETRY_LOAD)
{
if(file_format==FileFormat::NEOX_2)
{
file_format = FileFormat::NEOX_3;
printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
lr = gpttype_load_model(inputs, file_format, file_format_meta);
}
else
{
file_format = FileFormat::NEOX_5;
printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
lr = gpttype_load_model(inputs, file_format, file_format_meta);
}
}
if (lr == ModelLoadResult::RETRY_LOAD)
{
file_format = FileFormat::NEOX_1;
printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
lr = gpttype_load_model(inputs, file_format, file_format_meta);
}
if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
{
return false;
}
else
{
return true;
}
}
else
{
if(file_format==FileFormat::MPT_1)
{
printf("\n---\nIdentified as MPT model: (ver %d)\nAttempting to Load...\n---\n", file_format);
}
else if(file_format==FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2)
{
printf("\n---\nIdentified as RWKV model: (ver %d)\nAttempting to Load...\n---\n", file_format);
}
else if(file_format==FileFormat::GGUF_GENERIC)
{
printf("\n---\nIdentified as GGUF model: (ver %d)\nAttempting to Load...\n---\n", file_format);
}
else
{
printf("\n---\nUnidentified Model Encountered: (ver %d)\n---\n", file_format);
}
ModelLoadResult lr = gpttype_load_model(inputs, file_format, file_format_meta);
if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
{
return false;
}
else
{
return true;
}
}
}
//Text generation entry point: hands `inputs` to gpttype_generate and returns its result unchanged.
generation_outputs generate(const generation_inputs inputs)
{
    generation_outputs result = gpttype_generate(inputs);
    return result;
}
//Forwards `inputs` to sdtype_load_model and returns the boolean it reports.
bool sd_load_model(const sd_load_model_inputs inputs)
{
    const bool loaded = sdtype_load_model(inputs);
    return loaded;
}
//Forwards `inputs` to sdtype_generate and returns its result unchanged.
sd_generation_outputs sd_generate(const sd_generation_inputs inputs)
{
    sd_generation_outputs result = sdtype_generate(inputs);
    return result;
}
//Forwards `inputs` to whispertype_load_model and returns the boolean it reports.
bool whisper_load_model(const whisper_load_model_inputs inputs)
{
    const bool loaded = whispertype_load_model(inputs);
    return loaded;
}
//Forwards `inputs` to whispertype_generate and returns its result unchanged.
whisper_generation_outputs whisper_generate(const whisper_generation_inputs inputs)
{
    whisper_generation_outputs result = whispertype_generate(inputs);
    return result;
}
//Returns the C string of the entry at position `idx` in generated_tokens,
//or nullptr when `idx` is negative or past the end.
const char * new_token(int idx)
{
    const bool in_range = idx >= 0 && static_cast<size_t>(idx) < generated_tokens.size();
    if (!in_range)
    {
        return nullptr;
    }
    return generated_tokens[idx].c_str();
}
//Number of entries currently held in generated_tokens, converted to int.
int get_stream_count()
{
    const int count = static_cast<int>(generated_tokens.size());
    return count;
}
//Reports the current value of the generation_finished flag.
bool has_finished()
{
    const bool finished = generation_finished;
    return finished;
}
//Returns the current value of last_eval_time as a float.
float get_last_eval_time()
{
    const float eval_time = last_eval_time;
    return eval_time;
}
//Returns the current value of last_process_time as a float.
float get_last_process_time()
{
    const float process_time = last_process_time;
    return process_time;
}
//Returns the current value of last_token_count.
int get_last_token_count()
{
    const int count = last_token_count;
    return count;
}
//Returns the current value of last_seed.
int get_last_seed() {
    const int seed = last_seed;
    return seed;
}
//Returns the current value of the total_gens counter.
int get_total_gens()
{
    const int gens = total_gens;
    return gens;
}
//Returns the current value of the total_img_gens counter.
int get_total_img_gens() {
    const int img_gens = total_img_gens;
    return img_gens;
}
//Returns last_stop_reason converted to its integer value.
int get_last_stop_reason()
{
    const int reason = static_cast<int>(last_stop_reason);
    return reason;
}
//Returns the C-string data of the text reported by gpttype_get_pending_output.
//NOTE(review): the returned pointer is only usable if gpttype_get_pending_output returns a
//reference to storage that outlives this call (not a temporary) - confirm against its declaration.
const char* get_pending_output()
{
    const std::string & pending = gpttype_get_pending_output();
    return pending.c_str();
}
//Forwards to gpttype_generate_abort and returns the boolean it reports.
bool abort_generate()
{
    const bool result = gpttype_generate_abort();
    return result;
}
//Shared static buffer that backs the `ids` pointer returned by token_count.
static std::vector<int> toks; //just share a static object for token counting
//Tokenizes `input` through gpttype_get_token_arr (forwarding `addbos`) and reports the result.
//  input  : NUL-terminated text; a null pointer is treated as empty text.
//  addbos : passed through unchanged to gpttype_get_token_arr.
//Returns a token_count_outputs whose `count` is the number of tokens and whose `ids`
//points into the static `toks` buffer. That buffer is overwritten by the next call,
//so the caller must copy the ids out before calling token_count again.
token_count_outputs token_count(const char * input, bool addbos)
{
    //constructing std::string from a null pointer is undefined behaviour, so map null to ""
    std::string inputstr = (input != nullptr) ? input : "";
    token_count_outputs output;
    toks = gpttype_get_token_arr(inputstr,addbos);
    output.count = toks.size();
    output.ids = toks.data(); //valid only until toks is reassigned by the next call
    return output;
}
}