forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DispatchStub.h
204 lines (177 loc) · 6.69 KB
/
DispatchStub.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#pragma once
#include <c10/core/Backend.h>
#include <c10/core/ScalarType.h>
#include <c10/util/Exception.h>

#include <atomic>
#include <type_traits>
#include <utility>
// Implements instruction set specific function dispatch.
//
// Kernels that may make use of specialized instruction sets (e.g. AVX) are
// compiled multiple times with different compiler flags (e.g. -mavx). A
// DispatchStub contains a table of function pointers for a kernel. At runtime,
// the fastest available kernel is chosen based on the features reported by
// cpuinfo.
//
// Example:
//
// In native/MyKernel.h:
// using fn_type = void(*)(const Tensor& x);
// DECLARE_DISPATCH(fn_type, stub);
//
// In native/MyKernel.cpp
// DEFINE_DISPATCH(stub);
//
// In native/cpu/MyKernel.cpp:
// namespace {
// // use anonymous namespace so that different cpu versions won't conflict
// void kernel(const Tensor& x) { ... }
// }
// REGISTER_DISPATCH(stub, &kernel);
//
// To call:
// stub(kCPU, tensor);
//
// TODO: CPU instruction set selection should be folded into whatever
// the main dispatch mechanism is.
// ignore warnings about DispatchStub::DEFAULT, AVX, AVX2 defined elsewhere
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wundefined-var-template"
#endif
namespace at { namespace native {
// CPU instruction-set levels a kernel can be compiled for. The numeric values
// are ordered from least to most capable; DispatchStub::choose_cpu_impl()
// relies on that ordering when it compares capabilities with `>=`.
enum class CPUCapability {
  DEFAULT = 0,  // baseline build, no special instruction-set flags
  AVX = 1,
  AVX2 = 2,
  NUM_OPTIONS   // count of real entries above; not a capability itself
};
// Returns the CPU capability level that DispatchStub::choose_cpu_impl()
// compares against when picking a CPU kernel. Only declared here; the
// definition lives in another translation unit.
// NOTE(review): presumably derived from the features reported by cpuinfo, as
// the comment at the top of this file describes -- confirm at the definition.
CPUCapability get_cpu_capability();
// Primary template: intentionally left undefined. Only the specialization for
// plain function-pointer types below can be instantiated.
template <typename FnPtr, typename T>
struct CAFFE2_API DispatchStub;

// Table of kernel function pointers for one dispatched operation.
//
// `rT (*)(Args...)` is the kernel signature. `T` is a tag type that makes each
// stub a distinct class (DECLARE_DISPATCH passes the derived struct itself),
// so every stub owns its own DEFAULT / AVX / AVX2 static members.
template <typename rT, typename T, typename... Args>
struct CAFFE2_API DispatchStub<rT (*)(Args...), T> {
  using FnPtr = rT (*)(Args...);

  DispatchStub() = default;
  DispatchStub(const DispatchStub&) = delete;
  DispatchStub& operator=(const DispatchStub&) = delete;

  // Invokes the kernel registered for `device_type`, forwarding `args` to it
  // and returning its result.
  //
  // - CPU:  the kernel is selected on first use by choose_cpu_impl() and
  //         cached in cpu_dispatch_ptr.
  // - CUDA / HIP: asserts that a kernel has been registered, then calls it.
  // - anything else: reports an error via AT_ERROR.
  template <typename... ArgTypes>
  rT operator()(DeviceType device_type, ArgTypes&&... args) {
    if (device_type == DeviceType::CPU) {
      // Use memory_order_relaxed here since even if two threads race,
      // they will still compute the same value for cpu_dispatch_ptr.
      // The pointer is read exactly once into a local so the call below does
      // not go through a second (sequentially consistent) atomic load.
      FnPtr fptr = cpu_dispatch_ptr.load(std::memory_order_relaxed);
      if (!fptr) {
        // choose_cpu_impl() asserts that the kernel it returns is non-null.
        fptr = choose_cpu_impl();
        cpu_dispatch_ptr.store(fptr, std::memory_order_relaxed);
      }
      return (*fptr)(std::forward<ArgTypes>(args)...);
    } else if (device_type == DeviceType::CUDA) {
      AT_ASSERTM(cuda_dispatch_ptr, "DispatchStub: missing CUDA kernel");
      return (*cuda_dispatch_ptr)(std::forward<ArgTypes>(args)...);
    } else if (device_type == DeviceType::HIP) {
      AT_ASSERTM(hip_dispatch_ptr, "DispatchStub: missing HIP kernel");
      return (*hip_dispatch_ptr)(std::forward<ArgTypes>(args)...);
    } else {
      // The device type is appended to the message, hence the trailing ": ".
      AT_ERROR("DispatchStub: unsupported device type: ", device_type);
    }
  }

  // Returns the most capable CPU kernel that is both compiled in (guarded by
  // the HAVE_*_CPU_DEFINITION macros) and permitted by get_cpu_capability():
  // AVX2 first, then AVX, then DEFAULT. Asserts that the chosen pointer is
  // non-null.
  FnPtr choose_cpu_impl() {
    auto capability = static_cast<int>(get_cpu_capability());
    // Avoids an unused-variable warning when neither AVX macro is defined.
    (void)capability;
#ifdef HAVE_AVX2_CPU_DEFINITION
    if (capability >= static_cast<int>(CPUCapability::AVX2)) {
      AT_ASSERTM(AVX2, "DispatchStub: missing AVX2 kernel");
      return AVX2;
    }
#endif
#ifdef HAVE_AVX_CPU_DEFINITION
    if (capability >= static_cast<int>(CPUCapability::AVX)) {
      AT_ASSERTM(AVX, "DispatchStub: missing AVX kernel");
      return AVX;
    }
#endif
    AT_ASSERTM(DEFAULT, "DispatchStub: missing default kernel");
    return DEFAULT;
  }

  // Fixing dispatch error in Windows debug builds.
  // See https://github.com/pytorch/pytorch/issues/22681 for more details.
  // (In that configuration the members deliberately have no in-class
  // initializers.)
#if defined(_MSC_VER) && defined(_DEBUG)
  std::atomic<FnPtr> cpu_dispatch_ptr;
  FnPtr cuda_dispatch_ptr;
  FnPtr hip_dispatch_ptr;
#else
  std::atomic<FnPtr> cpu_dispatch_ptr{nullptr};
  FnPtr cuda_dispatch_ptr = nullptr;
  FnPtr hip_dispatch_ptr = nullptr;
#endif

  // Per-capability CPU kernels. Only declared here; REGISTER_ARCH_DISPATCH
  // provides the definitions as explicit specializations.
  static FnPtr DEFAULT;
#ifdef HAVE_AVX_CPU_DEFINITION
  static FnPtr AVX;
#endif
#ifdef HAVE_AVX2_CPU_DEFINITION
  static FnPtr AVX2;
#endif
};
namespace {

// Registration helper: constructing one stores `kernel` as the CUDA entry of
// `stub`. REGISTER_CUDA_DISPATCH creates a static instance of this type.
template <typename FnPtr, typename T>
struct RegisterCUDADispatch {
  RegisterCUDADispatch(DispatchStub<FnPtr, T>& stub, FnPtr kernel) {
    stub.cuda_dispatch_ptr = kernel;
  }
};

// Registration helper for HIP builds. Note that it currently writes the CUDA
// slot of `stub`, not hip_dispatch_ptr (see the TODO below).
template <typename FnPtr, typename T>
struct RegisterHIPDispatch {
  RegisterHIPDispatch(DispatchStub<FnPtr, T>& stub, FnPtr kernel) {
    // TODO: make this point at hip_dispatch_ptr
    stub.cuda_dispatch_ptr = kernel;
  }
};

} // anonymous namespace
// Compiler will complain if you put things like std::tuple<Tensor, Tensor> in
// the `fn` argument of DECLARE_DISPATCH, because the preprocessor splits macro
// arguments at the top-level comma. Some possible workarounds, e.g., adding
// parentheses and using a helper struct to get rid of the parentheses, do not
// work with MSVC. So introduce a type alias with `using` if you need to pass
// in such `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in
// GridSampleKernel.h.
//
// DECLARE_DISPATCH(fn, name):
//   Declares a non-copyable struct `name` deriving from
//   DispatchStub<fn, name> and an `extern` object that is also called `name`.
//   Because the type and the object share one identifier, the type has to be
//   referred to as `struct name` afterwards (as the macros in this file do).
//   The macro has no trailing semicolon; the caller supplies it.
#define DECLARE_DISPATCH(fn, name) \
struct name : DispatchStub<fn, name> { \
name() = default; \
name(const name&) = delete; \
name& operator=(const name&) = delete; \
}; \
extern CAFFE2_API struct name name
// DEFINE_DISPATCH(name):
//   Defines the object declared `extern` by DECLARE_DISPATCH. The caller
//   supplies the trailing semicolon.
#define DEFINE_DISPATCH(name) struct name name
// REGISTER_ARCH_DISPATCH(name, arch, fn):
//   Explicitly specializes the static member `arch` (DEFAULT, AVX or AVX2) of
//   DispatchStub<decltype(fn), struct name> and initializes it with `fn`.
//   The expansion already ends in a semicolon.
#define REGISTER_ARCH_DISPATCH(name, arch, fn) \
template <> decltype(fn) DispatchStub<decltype(fn), struct name>::arch = fn;
// The AVX / AVX2 variants expand to nothing when the corresponding
// HAVE_*_CPU_DEFINITION macro is not defined, matching the fact that
// DispatchStub does not declare those static members in that case.
#ifdef HAVE_AVX_CPU_DEFINITION
#define REGISTER_AVX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX, fn)
#else
#define REGISTER_AVX_DISPATCH(name, fn)
#endif
#ifdef HAVE_AVX2_CPU_DEFINITION
#define REGISTER_AVX2_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX2, fn)
#else
#define REGISTER_AVX2_DISPATCH(name, fn)
#endif
// REGISTER_NO_CPU_DISPATCH(name, fn_type):
//   Defines every CPU slot of the stub as a null `fn_type`, so a CPU call
//   through the stub fails the "missing ... kernel" assertion in
//   DispatchStub::choose_cpu_impl(). The three expansions are not separated
//   here; this relies on REGISTER_ARCH_DISPATCH ending in a semicolon.
#define REGISTER_NO_CPU_DISPATCH(name, fn_type) \
REGISTER_ARCH_DISPATCH(name, DEFAULT, static_cast<fn_type>(nullptr)) \
REGISTER_AVX_DISPATCH(name, static_cast<fn_type>(nullptr)) \
REGISTER_AVX2_DISPATCH(name, static_cast<fn_type>(nullptr))
// REGISTER_CUDA_DISPATCH / REGISTER_HIP_DISPATCH(name, fn):
//   Define a file-static helper object called `<name>__register`; its
//   constructor stores `fn` into the stub `name` (see RegisterCUDADispatch /
//   RegisterHIPDispatch above).
//   NOTE(review): both macros generate the same identifier, so using both for
//   the same `name` in one translation unit would be a redefinition.
#define REGISTER_CUDA_DISPATCH(name, fn) \
static RegisterCUDADispatch<decltype(fn), struct name> name ## __register(name, fn);
#define REGISTER_HIP_DISPATCH(name, fn) \
static RegisterHIPDispatch<decltype(fn), struct name> name ## __register(name, fn);
// REGISTER_DISPATCH(name, fn) picks the registration flavor from the compiler
// in use: CUDA registration under __CUDACC__ and (for now) under __HIPCC__,
// otherwise registration into the CPU slot named by CPU_CAPABILITY when that
// macro is defined. If none of these conditions holds, REGISTER_DISPATCH is
// left undefined.
// NB: This macro must be used in an actual 'cu' file; if you try using
// it from a 'cpp' file it will not work!
#if defined(__CUDACC__)
#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn)
#elif defined(__HIPCC__)
// TODO: cut this over to HIP dispatch once we stop pretending that CUDA
// is HIP in the PyTorch HIPify build.
#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn)
// #define REGISTER_DISPATCH(name, fn) REGISTER_HIP_DISPATCH(name, fn)
#elif defined(CPU_CAPABILITY)
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#endif
}} // namespace at::native
#if defined(__clang__)
#pragma clang diagnostic pop
#endif