#!/usr/bin/env python3
# exposes cudaKNN() and cudaL2Norm() functions from faiss_lite.cu to Python via ctypes
# see benchmark.py for example usage from Python
import os
import math
import torch
import ctypes as C
import numpy as np
from cuda import cuda, nvrtc
from cuda.cudart import (
    cudaMallocManaged,
    cudaHostAlloc,
    cudaHostAllocMapped,
    cudaHostGetDevicePointer,
    cudaMemAttachGlobal,
    cudaGetLastError,
    cudaGetErrorString,
    cudaError_t
)

_lib = C.CDLL('/opt/faiss_lite/build/libfaiss_lite.so')
# https://github.com/facebookresearch/faiss/blob/main/faiss/MetricType.h
DistanceMetrics = {
    'inner_product': 0,
    'l2': 1,
    'l1': 2,
    'Linf': 3,
    'canberra': 20,
    'braycurtis': 21,
    'jensenshannon': 22,
    'jaccard': 23,
    'cosine': 99,  # custom: normalized inner_product
}

def _cudaKNN(name='cudaKNN'):
    func = _lib[name]
    func.argtypes = [
        C.c_void_p,               # vectors
        C.c_void_p,               # queries
        C.c_int,                  # dsize
        C.c_int,                  # n
        C.c_int,                  # m
        C.c_int,                  # d
        C.c_int,                  # k
        C.c_int,                  # metric
        C.POINTER(C.c_float),     # vector_norms
        C.POINTER(C.c_float),     # out_distances
        C.POINTER(C.c_longlong),  # out_indices
        C.c_void_p,               # cudaStream_t
    ]
    func.restype = C.c_bool
    return func

def _cudaL2Norm(name='cudaL2Norm'):
    func = _lib[name]
    func.argtypes = [
        C.c_void_p,            # vectors
        C.c_int,               # dsize
        C.c_int,               # n
        C.c_int,               # d
        C.POINTER(C.c_float),  # output
        C.c_bool,              # squared
        C.c_void_p,            # cudaStream_t
    ]
    func.restype = C.c_bool
    return func

cudaKNN = _cudaKNN()
cudaL2Norm = _cudaL2Norm()
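
# Example (a sketch, not from this file - see benchmark.py for canonical usage):
# searching 'vectors' with 'queries' for the k nearest neighbors under the L2
# metric, with all buffers allocated by cudaAllocMapped() defined below. The
# shapes here are illustrative assumptions:
#
#   n, m, d, k = 1000, 8, 128, 4
#   vectors   = cudaAllocMapped((n, d), np.float32)
#   queries   = cudaAllocMapped((m, d), np.float32)
#   distances = cudaAllocMapped((m, k), np.float32)
#   indices   = cudaAllocMapped((m, k), np.int64)
#
#   ok = cudaKNN(
#       C.cast(vectors.ptr, C.c_void_p),
#       C.cast(queries.ptr, C.c_void_p),
#       4,                        # dsize (bytes per element, float32)
#       n, m, d, k,
#       DistanceMetrics['l2'],
#       None,                     # vector_norms (used by the 'cosine' metric)
#       C.cast(distances.ptr, C.POINTER(C.c_float)),
#       C.cast(indices.ptr, C.POINTER(C.c_longlong)),
#       None,                     # cudaStream_t (default stream)
#   )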

class AttrDict(dict):
    """
    A dict where keys are available as attributes
    https://stackoverflow.com/a/14620633
    """
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

class cudaArrayInterface():
    """
    Exposes __cuda_array_interface__ - typically used as a temporary view into a larger buffer
    https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html
    """
    def __init__(self, data, shape, dtype=np.float32):
        self.__cuda_array_interface__ = {
            'data': (data, False),  # R/W
            'shape': shape,
            'typestr': np.dtype(dtype).str,
            'version': 3,
        }

def dtype_to_ctype(dtype):
    if isinstance(dtype, str):
        dtype = np.dtype(dtype)
    if dtype == np.float16:
        return C.c_ushort
    else:
        return np.ctypeslib.as_ctypes_type(dtype)
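
# Note (illustrative, not from this file): ctypes has no native half-precision
# type, so float16 is carried as c_ushort and the raw bits are reinterpreted
# back to float16 in cudaToNumpy() below. For instance:
#
#   dtype_to_ctype('float32')   # -> ctypes.c_float
#   dtype_to_ctype(np.float16)  # -> ctypes.c_ushort (same 16-bit storage)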

def cudaAllocMapped(shape, dtype, map_numpy=True, map_torch=True, return_dict=True):
    """
    Allocate mapped (pinned) memory with cudaHostAlloc() and map it to a numpy array and PyTorch tensor.
    If return_dict is True, these will be returned in a dict-like AttrDict object
    with keys for 'ptr', 'array' (if map_numpy is True), and 'tensor' (if map_torch is True).
    Otherwise, a tuple will be returned with (ptr, array, tensor)
    """
    dsize = np.dtype(dtype).itemsize

    if isinstance(shape, int):
        size = shape * dsize
        shape = [shape]
    else:
        size = math.prod(shape) * dsize

    print(f"-- allocating {size} bytes ({size/(1024*1024):.2f} MB) with cudaHostAlloc()")

    #err, ptr = cudaMallocManaged(size, cudaMemAttachGlobal)
    err, ptr = cudaHostAlloc(size, cudaHostAllocMapped)
    assert_cuda(err)

    err, ptr = cudaHostGetDevicePointer(ptr, 0)
    assert_cuda(err)

    if map_numpy:
        array = cudaToNumpy(ptr, shape, dtype)

    if map_torch:
        tensor = cudaToTorch(ptr, shape, dtype)

    if return_dict:
        d = AttrDict()

        d.ptr = ptr
        d.shape = shape
        d.dtype = dtype

        if map_numpy:
            d.array = array

        if map_torch:
            d.tensor = tensor

        return d
    else:
        if map_numpy and map_torch:
            return ptr, array, tensor
        elif map_numpy:
            return ptr, array
        elif map_torch:
            return ptr, tensor
        else:
            return ptr
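
# Example (a sketch under this file's API): the numpy array and torch tensor
# returned by cudaAllocMapped() alias the same pinned allocation, so a write
# through one view is visible through the other once any in-flight GPU work
# has been synchronized:
#
#   buf = cudaAllocMapped((4, 8), np.float32)
#   buf.array[0, 0] = 1.0        # write via the numpy (CPU) view
#   print(buf.tensor[0, 0])      # read via the torch (CUDA) view -> 1.0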

def cudaToNumpy(ptr, shape, dtype):
    """
    Map a shared CUDA pointer into np.ndarray with the given shape and datatype.
    The pointer should have been allocated with cudaMallocManaged() or using cudaHostAllocMapped,
    and the user is responsible for any CPU/GPU synchronization (i.e. by using cudaStreams)
    """
    array = np.ctypeslib.as_array(C.cast(ptr, C.POINTER(dtype_to_ctype(dtype))), shape=shape)

    if dtype == np.float16:
        array.dtype = np.float16

    return array

def cudaToTorch(ptr, shape, dtype):
    """
    Map a shared CUDA pointer into a torch.Tensor with the given shape and datatype.
    The pointer should have been allocated with cudaMallocManaged() or using cudaHostAllocMapped,
    and the user is responsible for any CPU/GPU synchronization (i.e. by using cudaStreams)
    """
    return torch.as_tensor(cudaArrayInterface(ptr, shape, dtype), device='cuda')

def assert_cuda(err):
    """
    Throw a runtime exception if a CUDA error occurred
    """
    if isinstance(err, tuple) and len(err) == 1:
        err = err[0]

    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError(f"CUDA Error {err} -- {cuda.cuGetErrorString(err)[1]}")
    elif isinstance(err, cudaError_t):
        if err != cudaError_t.cudaSuccess:
            raise RuntimeError(f"CUDA Error {err} -- {cudaGetErrorString(err)[1]}")
    elif isinstance(err, nvrtc.nvrtcResult):
        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            raise RuntimeError(f"nvrtc Error {err}")
    else:
        raise RuntimeError(f"Unknown error type: {err}")