forked from torch/cutorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
init.lua
153 lines (137 loc) · 5.65 KB
/
init.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
require "torch"
paths.require("libcutorch")
-- Reuse the CPU types' __tostring__ implementations for the matching CUDA
-- storage and tensor types. Each entry is {CUDA type prefix, CPU type prefix};
-- note that the plain "Cuda" types are paired with the Float types.
do
   local printerPairs = {
      {'CudaByte', 'Byte'},
      {'CudaChar', 'Char'},
      {'CudaShort', 'Short'},
      {'CudaInt', 'Int'},
      {'CudaLong', 'Long'},
      {'Cuda', 'Float'},
      {'CudaDouble', 'Double'},
   }
   -- The Half types are only wired up when cutorch.hasHalf is set.
   if cutorch.hasHalf then
      printerPairs[#printerPairs + 1] = {'CudaHalf', 'Half'}
   end
   for _, pair in ipairs(printerPairs) do
      local cudaName, cpuName = pair[1], pair[2]
      torch[cudaName .. 'Storage'].__tostring__ = torch[cpuName .. 'Storage'].__tostring__
      torch[cudaName .. 'Tensor'].__tostring__ = torch[cpuName .. 'Tensor'].__tostring__
   end
end
require('cutorch.Tensor')
require('cutorch.FFI')
require('cutorch.test')
-- Compatibility shim: Lua 5.1 / LuaJIT expose a global `unpack`, while
-- Lua 5.2+ provide it as `table.unpack`. Use whichever one exists.
local unpack = unpack or table.unpack
--- Runs `closure` with `newDeviceID` selected as the current device, then
-- switches back to the device that was current on entry.
-- The closure is invoked with no arguments under pcall, so the previous
-- device is restored whether the closure returns normally or raises.
-- @param newDeviceID device id passed to cutorch.setDevice
-- @param closure     function called with no arguments
-- @return every value returned by `closure`, including nil values
-- Raises: re-raises the closure's error after the device has been restored.
function cutorch.withDevice(newDeviceID, closure)
   local curDeviceID = cutorch.getDevice()
   cutorch.setDevice(newDeviceID)
   -- Receive pcall's results as varargs rather than packing them into a
   -- table and unpacking via the length operator: `#` is unspecified for
   -- tables with nil holes, so nil results could otherwise be dropped.
   local function restoreAndForward(ok, ...)
      cutorch.setDevice(curDeviceID)
      if ok then
         return ...
      end
      -- On failure the single extra value is the error object; the
      -- parentheses truncate the varargs to exactly that value.
      error((...))
   end
   return restoreAndForward(pcall(closure))
end
-- Turns a variadic size description into a LongTensor.
--   * first argument nil/false -> LongTensor{0}
--   * torch.isStorage(...) true -> LongTensor constructed from the arguments
--   * anything else            -> LongTensor holding the listed arguments
local function longTensorSize(...)
   local first = ...
   if not first then
      return torch.LongTensor{0}
   end
   if torch.isStorage(...) then
      return torch.LongTensor(...)
   end
   return torch.LongTensor{...}
end
-- Builds a FloatTensor whose storage is allocated through
-- cutorch.CudaHostAllocator.
-- The arguments describe the size: either a LongStorage or a sequence of
-- numbers. The storage holds as many elements as the product of the sizes.
function cutorch.createCudaHostTensor(...)
   local dims = longTensorSize(...)
   local numel = dims:prod()
   local hostStorage = torch.FloatStorage(cutorch.CudaHostAllocator, numel)
   return torch.FloatTensor(hostStorage, 1, dims:storage())
end
-- Builds a DoubleTensor whose storage is allocated through
-- cutorch.CudaHostAllocator.
-- The arguments describe the size: either a LongStorage or a sequence of
-- numbers. The storage holds as many elements as the product of the sizes.
function cutorch.createCudaHostDoubleTensor(...)
   local dims = longTensorSize(...)
   local numel = dims:prod()
   local hostStorage = torch.DoubleStorage(cutorch.CudaHostAllocator, numel)
   return torch.DoubleTensor(hostStorage, 1, dims:storage())
end
-- The half-precision variant is only defined when cutorch.hasHalf is set.
if cutorch.hasHalf then
   -- Builds a HalfTensor whose storage is allocated through
   -- cutorch.CudaHostAllocator. The arguments describe the size: either a
   -- LongStorage or a sequence of numbers.
   function cutorch.createCudaHostHalfTensor(...)
      local dims = longTensorSize(...)
      local numel = dims:prod()
      local hostStorage = torch.HalfStorage(cutorch.CudaHostAllocator, numel)
      return torch.HalfTensor(hostStorage, 1, dims:storage())
   end
end
-- Creates a FloatTensor backed by a FloatStorage allocated through
-- cutorch.CudaUVAAllocator.
-- Accepts either a LongStorage or a sequence of numbers; the storage holds
-- as many elements as the product of the requested sizes. The tensor is
-- constructed from the storage alone (no explicit size is passed).
local function _createUVATensor(...)
   local dims = longTensorSize(...)
   -- Why the device is synchronized before allocating: see
   -- CUDA_C_Programming_guide.pdf for a detailed explanation.
   -- Section J.
   -- "It is worth a comment on the synchronization between host and device. Notice how in
   -- the non-managed example, the synchronous cudaMemcpy() routine is used both to
   -- synchronize the kernel (that is, to wait for it to finish running), and to transfer the data
   -- to the host. The Unified Memory examples do not call cudaMemcpy() and so require an
   -- explicit cudaDeviceSynchronize() before the host program can safely use the output
   -- from the GPU."
   -- Section J.2.2.1.
   -- " Note that if memory is dynamically allocated with cudaMallocManaged() or
   -- cuMemAllocManaged() while the GPU is active, the behavior of the memory is
   -- unspecified until additional work is launched or the GPU is synchronized. Attempting
   -- to access the memory on the CPU during this time may or may not cause a segmentation
   -- fault."
   cutorch.synchronize()
   local numel = dims:prod()
   return torch.FloatTensor(torch.FloatStorage(cutorch.CudaUVAAllocator, numel))
end
-- Public entry point for creating a FloatTensor through _createUVATensor.
-- Forwards all arguments unchanged and returns whatever _createUVATensor
-- returns.
function cutorch.createFloatUVATensor(...)
return _createUVATensor(...)
end
-- Creates a CudaTensor on top of memory obtained through the
-- CudaUVAAllocator.
-- Accepts either a LongStorage or a sequence of numbers.
-- Allocation and synchronization are delegated to _createUVATensor, which
-- yields a FloatTensor; a CudaStorage is then created from that tensor's
-- storage size and raw data address.
-- NOTE(review): the CudaStorage is built from a raw address; whether it
-- keeps the originating FloatStorage alive is not visible here -- confirm
-- the lifetime of the underlying allocation.
function cutorch.createCudaUVATensor(...)
   local floatTensor = _createUVATensor(...)
   local floatStorage = floatTensor:storage()
   local numel = floatStorage:size()
   local address = tonumber(torch.data(floatStorage, true))
   return torch.CudaTensor(torch.CudaStorage(numel, address))
end
-- UVA storage is a single memory location backed by virtual addressing, so
-- converting between the CPU and GPU tensor types is done by passing the raw
-- pointer rather than by going through a conversion routine.
-- Only FloatTensor and CudaTensor are supported for now (Cuda -> float and
-- float -> Cuda).
--
-- Returns a FloatTensor over a FloatStorage created from the size and raw
-- data address of t's storage. Raises if `t` is not a tensor, and asserts
-- that the resulting storage is managed (cutorch.isManaged).
function cutorch.toFloatUVATensor(t)
   if not torch.isTensor(t) then
      error('Must use a tensor, got ' .. torch.type(t))
   end
   local source = t:storage()
   local numel = source:size()
   local address = tonumber(torch.data(source, true))
   local floatStorage = torch.FloatStorage(numel, address)
   assert(cutorch.isManaged(floatStorage))
   return torch.FloatTensor(floatStorage)
end
-- Returns a CudaTensor over a CudaStorage created from the size and raw
-- data address of t's storage. Raises if `t` is not a tensor, and asserts
-- that the resulting storage is managed (cutorch.isManaged).
function cutorch.toCudaUVATensor(t)
   if not torch.isTensor(t) then
      error('Must use a tensor, got ' .. torch.type(t))
   end
   local source = t:storage()
   local numel = source:size()
   local address = tonumber(torch.data(source, true))
   local cudaStorage = torch.CudaStorage(numel, address)
   assert(cutorch.isManaged(cudaStorage))
   return torch.CudaTensor(cudaStorage)
end
-- Reports whether the data address of a tensor or storage is managed,
-- as decided by cutorch.isManagedPtr.
-- Raises a usage error when `t` is neither a tensor nor a storage.
function cutorch.isManaged(t)
   local accepted = torch.isTensor(t) or torch.isStorage(t)
   if not accepted then
      error('Usage: cutorch.isManaged(Tensor|Storage), got ' .. torch.type(t))
   end
   local address = tonumber(torch.data(t, true))
   return cutorch.isManagedPtr(address)
end
-- Heap tracking is switched on when this file is loaded.
-- Remove the following line to disable automatic cutorch heap-tracking
-- for garbage collection.
cutorch.setHeapTracking(true)
-- Return the cutorch table as the result of loading this file.
return cutorch