// Copyright (c) 2023 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef NE_SHARED
#if defined(_WIN32) && !defined(__MINGW32__)
#ifdef NE_BUILD
#define NE_API __declspec(dllexport)
#else
#define NE_API __declspec(dllimport)
#endif
#else
#define NE_API __attribute__((visibility("default")))
#endif
#else
#define NE_API
#endif
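// Illustrative only: when ne is built as a shared library, the library itself
// is compiled with -DNE_SHARED -DNE_BUILD so NE_API expands to dllexport /
// default visibility, while consumers compile with just -DNE_SHARED so the
// symbols resolve as imports. A static build leaves NE_API empty.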
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>   // fprintf/printf used by NE_ASSERT and the logging macros
#include <stdlib.h>  // abort used by NE_ASSERT
#include "core/data_types.h"
#include "layers/Ops.h"
#ifdef NS_TP_MODEL
#include "core/parallel_context.h"
#endif
#define NE_FILE_MAGIC 0x67676d6c // "ggml"
#define NE_FILE_VERSION 1
#define NE_MAX_DIMS 4
#define NE_MAX_NODES 40960
#define NE_MAX_PARAMS 256
#define NE_MAX_CONTEXTS 64
#define NE_MAX_OPT 36
#define NE_DEFAULT_N_THREADS 4
#define NE_MAX_OP_PARAMS 32
#define NE_SIZE_CALC -1
#define NE_ALIGNMENT 64
#define NE_ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "NE_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
abort(); \
} \
} while (0)
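// Usage sketch: NE_ASSERT(tensor->n_dims <= NE_MAX_DIMS);
// On failure this prints the file, line and failing expression, then aborts.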
//
// logging
//
#define NE_DEBUG 0
#if (NE_DEBUG >= 1)
#define NE_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define NE_PRINT_DEBUG(...)
#endif
#if (NE_DEBUG >= 5)
#define NE_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define NE_PRINT_DEBUG_5(...)
#endif
#if (NE_DEBUG >= 10)
#define NE_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define NE_PRINT_DEBUG_10(...)
#endif
#define NE_PRINT(...) printf(__VA_ARGS__)
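// Raise NE_DEBUG above (e.g. to 1, 5 or 10) to enable the corresponding
// NE_PRINT_DEBUG* levels; at 0 they all compile to nothing.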
#ifdef __cplusplus
extern "C" {
#endif
struct ne_object;
struct ne_context;
enum ne_backend {
NE_BACKEND_CPU = 0,
NE_BACKEND_SYCL = 1,
};
// ne object
struct ne_object {
size_t offs;
size_t size;
struct ne_object* next;
char padding[40]; // 8 for NE_MEM_ALIGN=16, 40 for NE_MEM_ALIGN=64
};
static const size_t NE_OBJECT_SIZE = sizeof(struct ne_object);
// scratch buffer
struct ne_scratch {
size_t offs;
size_t size;
void* data;
};
//
// ne context
//
#define MAX_SYCL_BUFFER_SIZE (4000ull << 20) // 4GB
#define MAX_SYCL_BUFFER_COUNT 64 // 64 * ~4GB = ~256GB
struct ne_sycl_context {
void* dev;
void* queue;
int n_buffers;
void* buffers[MAX_SYCL_BUFFER_COUNT];
size_t offs[MAX_SYCL_BUFFER_COUNT];
size_t offs_save[MAX_SYCL_BUFFER_COUNT];
size_t sizes[MAX_SYCL_BUFFER_COUNT];
};
struct ne_context {
size_t mem_size;
void* mem_buffer;
bool mem_buffer_owned;
bool no_alloc;
int n_objects;
struct ne_object* objects_begin;
struct ne_object* objects_end;
struct ne_scratch scratch;
struct ne_scratch scratch_save;
struct ne_object* objects_save;
struct ne_sycl_context* dev_ctx;
};
struct ne_context_container {
bool used;
struct ne_context context;
};
// n-dimensional tensor
struct ne_tensor {
enum ne_type type;
enum ne_backend backend;
int n_dims;
int64_t ne[NE_MAX_DIMS]; // number of elements
size_t nb[NE_MAX_DIMS]; // stride in bytes:
// nb[0] = sizeof(type)
// nb[1] = nb[0] * ne[0] + padding
// nb[i] = nb[i-1] * ne[i-1]
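// e.g. an unpadded 4x3 F32 tensor (ne = {4, 3, 1, 1}) has
// nb = {4, 16, 48, 48}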
// compute data
enum ne_op op;
bool is_param;
// op params - allocated as int32_t for alignment
int32_t op_params[NE_MAX_OP_PARAMS / sizeof(int32_t)];
struct ne_tensor* grad;
struct ne_tensor* src0;
struct ne_tensor* src1;
struct ne_tensor* opt[NE_MAX_OPT];
// thread scheduling
int n_tasks;
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
void* data;
size_t size;
char name[32];
char padding[8];
};
static const size_t NE_TENSOR_SIZE = sizeof(struct ne_tensor);
// computation graph
struct ne_cgraph {
int n_nodes;
int n_leafs;
int n_threads;
size_t work_size;
struct ne_tensor* work;
size_t dev_work_size;
struct ne_tensor* dev_work;
struct ne_tensor* nodes[NE_MAX_NODES];
struct ne_tensor* grads[NE_MAX_NODES];
struct ne_tensor* leafs[NE_MAX_NODES];
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
};
struct ne_init_params {
// memory pool
size_t mem_size; // bytes
void* mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
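// Illustrative only (the context create/free declarations are not part of
// this excerpt): a 16 MiB internally-allocated pool might be requested as
//   struct ne_init_params params = {
//       /*.mem_size   =*/ 16 * 1024 * 1024,
//       /*.mem_buffer =*/ NULL,   // let the library allocate the pool
//       /*.no_alloc   =*/ false,  // tensor data lives inside the pool
//   };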
//
// compute types
//
enum ne_task_type {
NE_TASK_INIT = 0,
NE_TASK_COMPUTE,
NE_TASK_FINALIZE,
};
struct ne_compute_params {
enum ne_task_type type;
int ith, nth;
// work buffer for all threads
size_t wsize;
void* wdata;
size_t dev_wsize;
void* dev_wdata;
void* dev_queue;
};
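// Illustrative only: a common way for an op kernel to split work across
// threads with ith/nth (not a declaration from this file):
//   const int dr  = (nrows + params->nth - 1) / params->nth; // rows per thread
//   const int ir0 = dr * params->ith;                        // first row
//   const int ir1 = ir0 + dr < nrows ? ir0 + dr : nrows;     // one past last row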
#ifdef __cplusplus
}
#endif