types.cu
#include "types.h"
#include "gnn.h"
#include "cuda_helper.h"

template<typename DT, int dim>
TensorAccessorR<DT, dim>::TensorAccessorR(PhysicalRegion region,
                                          RegionRequirement req,
                                          FieldID fid,
                                          Context ctx,
                                          Runtime* runtime,
                                          ResourceManager* manager)
{
  const AccessorRO<DT, dim> acc(region, fid);
  rect = runtime->get_index_space_domain(
      ctx, req.region.get_index_space());
  assert(acc.accessor.is_dense_arbitrary(rect));
  ptr = acc.ptr(rect);
  // The region must be mapped to exactly one memory.
  std::set<Memory> memories;
  region.get_memories(memories);
  assert(memories.size() == 1);
  memory = *memories.begin();
  if (memory.kind() == Memory::GPU_FB_MEM) {
    // Data already resides in GPU framebuffer memory; no staging needed.
    fbCache = NULL;
  } else if (memory.kind() == Memory::Z_COPY_MEM) {
    // Data lives in zero-copy (host-pinned) memory: stage it into a
    // framebuffer cache slot managed by the ResourceManager.
    int id = manager->assign(region, rect.volume());
    assert(id >= 0);
    fbCache = (DT*) manager->fbCache[id].ptr;
    checkCUDA(cudaMemcpyAsync(fbCache, ptr, rect.volume() * sizeof(DT),
                              cudaMemcpyHostToDevice));
  } else {
    // Only framebuffer and zero-copy memories are supported.
    assert(false);
  }
}

template<typename DT>
__global__
void zero_array(DT* ptr, coord_t size)
{
  // Element-wise kernel that zero-initializes an array on the device.
  CUDA_KERNEL_LOOP(i, size)
  {
    ptr[i] = 0;
  }
}

template<typename DT, int dim>
TensorAccessorW<DT, dim>::TensorAccessorW(PhysicalRegion region,
                                          RegionRequirement req,
                                          FieldID fid,
                                          Context ctx,
                                          Runtime* runtime,
                                          ResourceManager* manager,
                                          bool readOutput)
{
  rect = runtime->get_index_space_domain(
      ctx, req.region.get_index_space());
  // Use a read-write accessor when the existing output must be read back;
  // otherwise a write-only accessor is sufficient.
  if (readOutput) {
    const AccessorRW<DT, dim> acc(region, fid);
    assert(acc.accessor.is_dense_arbitrary(rect));
    ptr = acc.ptr(rect);
  } else {
    const AccessorWO<DT, dim> acc(region, fid);
    assert(acc.accessor.is_dense_arbitrary(rect));
    ptr = acc.ptr(rect);
  }
  // The region must be mapped to exactly one memory.
  std::set<Memory> memories;
  region.get_memories(memories);
  assert(memories.size() == 1);
  memory = *memories.begin();
  if (memory.kind() == Memory::GPU_FB_MEM) {
    // Data already resides in GPU framebuffer memory; no staging needed.
    fbCache = NULL;
  } else if (memory.kind() == Memory::Z_COPY_MEM) {
    // Stage through a framebuffer cache slot managed by the ResourceManager.
    int id = manager->assign(region, rect.volume());
    assert(id >= 0);
    fbCache = (DT*) manager->fbCache[id].ptr;
    if (readOutput) {
      // Copy the current output contents to the cache so they can be read.
      checkCUDA(cudaMemcpyAsync(fbCache, ptr, rect.volume() * sizeof(DT),
                                cudaMemcpyHostToDevice));
    } else {
      // Currently we zero-initialize the fbCache when the output is not read.
      zero_array<DT><<<GET_BLOCKS(rect.volume()), CUDA_NUM_THREADS>>>(
          fbCache, rect.volume());
    }
  } else {
    // Only framebuffer and zero-copy memories are supported.
    assert(false);
  }
}
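
// Explicit template instantiations.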
template class TensorAccessorR<NodeStruct, 1>;
template class TensorAccessorR<EdgeStruct, 1>;
template class TensorAccessorR<DATATYPE, 1>;
template class TensorAccessorR<DATATYPE, 2>;
template class TensorAccessorR<DATATYPE, 3>;
template class TensorAccessorR<int, 2>;
template class TensorAccessorW<DATATYPE, 1>;
template class TensorAccessorW<DATATYPE, 2>;
template class TensorAccessorW<DATATYPE, 3>;
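
// ---------------------------------------------------------------------------
// Hypothetical usage sketch (not part of the original file). It illustrates
// how these accessors could be constructed inside a Legion task body; the
// task name, region indices, and field id below are assumptions made for
// illustration only.
//
//   void example_task(const Task* task,
//                     const std::vector<PhysicalRegion>& regions,
//                     Context ctx, Runtime* runtime)
//   {
//     ResourceManager* manager = ...; // obtained from the task arguments
//     TensorAccessorR<DATATYPE, 2> accInput(
//         regions[0], task->regions[0], FID_DATA, ctx, runtime, manager);
//     TensorAccessorW<DATATYPE, 2> accOutput(
//         regions[1], task->regions[1], FID_DATA, ctx, runtime, manager,
//         false /*readOutput*/);
//     // If a region was mapped to zero-copy memory, its fbCache member points
//     // to a staged copy in GPU framebuffer memory; if it was mapped to
//     // GPU_FB_MEM, fbCache is NULL and ptr can be used directly on device.
//   }
// ---------------------------------------------------------------------------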