-
Notifications
You must be signed in to change notification settings - Fork 0
/
lifeGrid.cu
303 lines (238 loc) · 10.3 KB
/
lifeGrid.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
/*
 * The kernel found in life32x32.cu modified to handle an array of blocks
 * distributed over a grid. Note that the kernel uses bitmasks to perform the
 * modulo operations to facilitate wrap-around, so it is only possible to have
 * grids with power-of-two dimensions.
 */
#include <chrono>
#include <cooperative_groups.h>
#include <cuda.h>
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
// Log2 of the number of 32x32 tiles along x and y. The kernel wraps indexes
// with bitmasks, so both grid dimensions must be powers of two.
#define GRID_X_LOG2 2
#define GRID_Y_LOG2 2
// Microseconds to sleep between drawn frames
#define SLEEP_TIME 100000
// Generations computed per kernel launch (between redraws)
#define GENERATION_STEP 1
// Half-block glyphs indexed by a 2-bit pair of vertically adjacent cells:
// bit 0 = upper cell (printed first), bit 1 = lower cell.
// 0 = both dead, 1 = upper alive, 2 = lower alive, 3 = both alive
const char* outputVector[4] = {(char*)" ", (char*)"▀", (char*)"▄", (char*)"█"};
/*
 * Conway's Game of Life over a grid of 32x32 tiles: one block per tile, one
 * thread per tile column (blockDim must be (32, 1, 1)). Wrap-around at the
 * grid edges is done with bitmasks, so gridDim.x and gridDim.y must be powers
 * of two. Must be launched with cudaLaunchCooperativeKernel, since a grid-wide
 * barrier separates generations.
 *
 * cols: one uint per tile column; bit i of
 *       cols[(tileY * gridDim.x + tileX) * 32 + column] is the cell in row i
 *       of that column.
 * numgenerations: number of generations to advance before returning.
 */
__global__ void tiledLifeKernel(uint *cols, uint32_t numgenerations) {
  // Shared scratch for this tile plus a one-cell halo: a 34x34 cell array
  // stored with a 35-byte row stride (one padding byte per row to avoid
  // shared-memory bank conflicts). 34 * 35 = 1190.
  __shared__ uint8_t cells[1190];
  // Halo columns from the left/right neighbor tiles, loaded by threads 0 and
  // 1 and then read by every thread in the block
  __shared__ uint32_t sidesData[2];
  // Grid group used to synchronize the entire kernel between generations
  cooperative_groups::grid_group grid = cooperative_groups::this_grid();
  // Wrapped x indexes of the left ([0]) and right ([1]) neighbor tiles;
  // "& (gridDim.x - 1)" is the power-of-two modulo
  uint16_t sideBlockX[2];
  sideBlockX[0] = (blockIdx.x - 1) & (gridDim.x - 1);
  sideBlockX[1] = (blockIdx.x + 1) & (gridDim.x - 1);
  // Wrapped y indexes of the neighboring tile rows
  uint16_t topBlockY = (blockIdx.y + 1) & (gridDim.y - 1);
  uint16_t bottomBlockY = (blockIdx.y - 1) & (gridDim.y - 1);
  // Each thread owns one column of the tile
  // TODO Hopefully won't run out of registers, but might want to benchmark
  // this anyway
  uint16_t colIdx = threadIdx.x;
  // Unpack this tile's own column into the interior of the shared array
  uint32_t colData =
      cols[(((blockIdx.y * gridDim.x) + blockIdx.x) << 5) + colIdx];
  for (uint8_t i = 0; i < 32; ++i) {
    cells[(i + 1) * 35 + colIdx + 1] = (colData >> i) & 0x1;
  }
  // Begin performing the actual cellular automation portion of the kernel
  for (uint32_t g = 0; g < numgenerations; ++g) {
    // #######################################################################
    // ##### BEGIN WARP DIVERGENCE #####
    // #######################################################################
    // Threads 0 and 1 fetch the four corner cells adjacent to the tile and
    // the full side columns from the neighboring tiles. This diverges within
    // the warp, but only two threads need to do the work.
    if (colIdx < 2) {
      // Thread 0 reads the left neighbor's last column (offset 31), thread 1
      // reads the right neighbor's first column (offset 0)
      uint8_t offset = 0x1f * (~colIdx & 0x1);
      // Read and save the top corners (bit 31 of the diagonal neighbor's
      // edge column)
      colData =
          cols[(((topBlockY * gridDim.x) + sideBlockX[colIdx]) << 5) +
               offset];
      cells[colIdx * 33] = colData >> 31;
      // Read the side columns; they are scattered into shared memory by all
      // threads below
      sidesData[colIdx] =
          cols[(((blockIdx.y * gridDim.x) + sideBlockX[colIdx]) << 5) +
               offset];
      // Read and save the bottom corners (bit 0 of the diagonal neighbor's
      // edge column)
      colData =
          cols[(((bottomBlockY * gridDim.x) + sideBlockX[colIdx]) << 5) +
               offset];
      cells[33 * 35 + colIdx * 33] = colData & 0b1;
    }
    // #######################################################################
    // ##### END WARP DIVERGENCE #####
    // #######################################################################
    // BUGFIX: this barrier is required, not optional. sidesData is written
    // only by threads 0 and 1 but read just below by all 32 threads; with
    // independent thread scheduling (Volta+) even a single warp is not
    // guaranteed to be converged here, so omitting it is a data race.
    __syncthreads();
    // Scatter the side columns (loaded in the divergent section) into the
    // left and right halo columns of the shared array, one bit per thread
    uint32_t leftSide = sidesData[0];
    cells[(colIdx + 1) * 35] = (leftSide >> colIdx) & 0x1;
    uint32_t rightSide = sidesData[1];
    cells[(colIdx + 1) * 35 + 33] = (rightSide >> colIdx) & 0x1;
    // Halo row above this tile: bit 31 of the topBlockY neighbor's columns.
    // (Reuses the outer colData register; the original redeclared it here,
    // shadowing the outer variable.)
    colData =
        cols[(((topBlockY * gridDim.x) + blockIdx.x) << 5) + colIdx];
    cells[colIdx + 1] = colData >> 31;
    // Halo row below this tile: bit 0 of the bottomBlockY neighbor's columns
    colData =
        cols[(((bottomBlockY * gridDim.x) + blockIdx.x) << 5) + colIdx];
    cells[33 * 35 + colIdx + 1] = colData & 0x1;
    __syncthreads();
    // #######################################################################
    // ###### END MEMORY LOADING ######
    // #######################################################################
    // Walk down the column keeping a three-row sliding window of partial
    // neighbor counts (previous / current / next row)
    uint8_t lastSides = 0, lastMiddle = 0, thisSides = 0, thisMiddle = 0,
            nextSides = 0, nextMiddle = 0;
    // Neighbors in the previous (halo) row
    lastSides = cells[colIdx] & 0x1;
    lastSides += cells[colIdx + 2] & 0x1;
    lastMiddle = cells[colIdx + 1] & 0x1;
    // Side neighbors in the current row
    thisSides = cells[35 + colIdx] & 0x1;
    thisSides += cells[35 + colIdx + 2] & 0x1;
    // State of the current cell itself
    thisMiddle = cells[35 + colIdx + 1] & 0x1;
    for (int i = 1; i < 33; ++i) {
      // Neighbors in the next row
      nextSides = cells[(i + 1) * 35 + colIdx] & 0x1;
      nextSides += cells[(i + 1) * 35 + colIdx + 2] & 0x1;
      nextMiddle = cells[(i + 1) * 35 + colIdx + 1] & 0x1;
      // Total live neighbors of the cell at row i
      uint8_t neighbors =
          lastSides + lastMiddle + thisSides + nextSides + nextMiddle;
      // Branch-free life rule: set bit 1 of the cell byte to the next state
      // (alive iff neighbors == 3, or neighbors == 2 and currently alive)
      // while bit 0 keeps holding the current state for neighboring threads
      cells[i * 35 + colIdx + 1] |=
          (~neighbors >> 1 & neighbors & (thisMiddle | neighbors) << 1) & 0x2;
      // Slide the window down one row
      lastSides = thisSides;
      lastMiddle = thisMiddle;
      thisSides = nextSides;
      thisMiddle = nextMiddle;
    }
    // #######################################################################
    // ##### END CELLULAR COMPUTATIONS #####
    // #######################################################################
    // Make sure every thread has finished consuming its neighbors' bit-0
    // states before those states are overwritten
    __syncthreads();
    // Promote the next state (bit 1) into the current state (bit 0)
    for (int i = 1; i < 33; ++i) {
      cells[i * 35 + colIdx + 1] >>= 1;
    }
    // Pack this column back into a 32-bit word...
    colData = 0x00000000;
    for (int i = 0; i < 32; ++i) {
      colData |= (cells[(i + 1) * 35 + colIdx + 1] & 0x1) << i;
    }
    // ...and publish it to global memory so other blocks can read it
    cols[(((blockIdx.y * gridDim.x) + blockIdx.x) << 5) + colIdx] = colData;
    // Grid-wide barrier: no block may begin the next generation until every
    // block has written its columns back
    grid.sync();
  }
  // #########################################################################
  // ##### END LIFE KERNEL #####
  // #########################################################################
}
/*
 * Fill the column array with an initial pattern: the first three columns of
 * every group of eight get a small pattern (it matches a glider: rows
 * 0b0001 / 0b0110 / 0b0011) stamped into each of their four byte lanes, and
 * every other column is cleared.
 *
 * BUGFIX: the original never wrote indices i+3..i+7, leaving them with
 * whatever garbage the (unzeroed) allocation contained, and wrote i+1/i+2
 * without checking they were still inside the array.
 *
 * cells:  array of packed column words (bit j = row j of the column)
 * length: number of elements in cells
 */
void generateCells(uint32_t *cells, uint32_t length) {
  // Random initialization alternative (kept for reference):
  //   seed srand() from the clock and fill each word with
  //   (rand()+rand()+rand()) & (rand()+rand()+rand()).
  // Clear everything first so untouched columns are empty, not garbage
  for (uint32_t i = 0; i < length; ++i) {
    cells[i] = 0;
  }
  // Stamp the pattern into columns 0..2 of every group of eight, once per
  // byte lane (rows 0-2, 8-10, 16-18, 24-26). Guard i+2 so a length that is
  // not a multiple of 8 cannot overrun the array.
  for (uint32_t i = 0; i + 2 < length; i += 8) {
    for (int shift = 0; shift < 32; shift += 8) {
      cells[i]     |= 0b0001u << shift;
      cells[i + 1] |= 0b0110u << shift;
      cells[i + 2] |= 0b0011u << shift;
    }
  }
}
/*
 * Render the whole grid to the terminal, two cell rows per character row
 * using half-block glyphs, then print the generation number and sleep so the
 * animation is visible.
 *
 * BUGFIX: the loop bounds previously used the GRID_X_LOG2 / GRID_Y_LOG2
 * macros instead of the gridDimXLog2 / gridDimYLog2 parameters (which were
 * already used for the cell index), so the parameters were silently ignored.
 *
 * cells:       packed column words, gridDimX*gridDimY tiles of 32 words each
 * generation:  generation number to print under the grid
 * sleepTime:   microseconds to sleep after drawing
 * gridDimXLog2 / gridDimYLog2: log2 of the tile-grid dimensions
 */
void drawCells(uint32_t *cells, int generation, uint32_t sleepTime, uint16_t gridDimXLog2,
               uint16_t gridDimYLog2) {
  // Move the cursor to the terminal's top-left so frames overwrite each other
  printf("\033[H");
  for (int b = 0; b < 1 << gridDimYLog2; ++b) {
    // Two cell rows per printed line
    for (int y = 0; y < 32; y += 2) {
      for (int x = 0; x < (1 << gridDimXLog2) * 32; ++x) {
        // Extract the pair of vertically adjacent cells at rows y and y+1;
        // each tile row of blocks spans (1 << (gridDimXLog2 + 5)) words
        uint8_t cellPair =
            (cells[b * (1 << (gridDimXLog2 + 5)) + x] >> y) & 0x3;
        // fputs instead of printf(data): never treat glyph data as a format
        // string
        fputs(outputVector[cellPair], stdout);
      }
      printf("\n");
    }
  }
  printf("%d ", generation);
  usleep(sleepTime);
}
/*
 * Cooperatively launch tiledLifeKernel over the whole grid and wait for it to
 * finish. A cooperative launch is required because the kernel uses a
 * grid-wide barrier between generations (requires compute capability >= 6.0).
 *
 * cells:          managed/device pointer to the packed column array
 * numGenerations: generations to advance in this launch
 * gridDimXLog2 / gridDimYLog2: log2 of the tile-grid dimensions
 */
void launchKernel(uint32_t *cells, uint32_t numGenerations,
                  uint16_t gridDimXLog2, uint16_t gridDimYLog2) {
  // One block per 32x32 tile
  dim3 gridDim;
  gridDim.x = 1 << gridDimXLog2;
  gridDim.y = 1 << gridDimYLog2;
  gridDim.z = 1;
  // One thread per tile column
  dim3 blockDim;
  blockDim.x = 32;
  blockDim.y = 1;
  blockDim.z = 1;
  // Kernel argument pointers on the stack (the original leaked a heap
  // allocation here every launch)
  void *args[2] = {&cells, &numGenerations};
  // Check both the launch itself and the asynchronous execution; cooperative
  // launches fail outright on devices/configs that don't support them
  cudaError_t err =
      cudaLaunchCooperativeKernel((void *)tiledLifeKernel, gridDim, blockDim, args);
  if (err != cudaSuccess) {
    std::cerr << "Cooperative kernel launch failed: "
              << cudaGetErrorString(err) << std::endl;
    return;
  }
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    std::cerr << "Kernel execution failed: " << cudaGetErrorString(err)
              << std::endl;
  }
}
/*
 * Entry point: allocate the grid in managed memory, seed it, then alternate
 * kernel launches and terminal redraws forever.
 * TODO: restore the cudaDeviceGetAttribute(cudaDevAttrCooperativeLaunch)
 * check that was commented out here, and bail out cleanly on devices without
 * cooperative-launch support instead of failing inside launchKernel.
 */
int main(int argc, char **argv) {
  // Total column words: 32 columns per tile times the number of tiles
  const uint32_t numCols = 32 * (1 << GRID_X_LOG2) * (1 << GRID_Y_LOG2);
  uint32_t *cells;
  // Managed memory so both generateCells/drawCells (host) and the kernel
  // (device) can touch the same allocation
  cudaError_t err = cudaMallocManaged(&cells, sizeof(uint32_t) * numCols);
  if (err != cudaSuccess) {
    std::cout << "cudaMallocManaged failed: " << cudaGetErrorString(err)
              << std::endl;
    return 1;
  }
  generateCells(cells, numCols);
  // Draw the initial state as generation 0
  drawCells(cells, 0, SLEEP_TIME, GRID_X_LOG2, GRID_Y_LOG2);
  uint32_t generation = 1;
  // Animate forever (the original's "generation < 5 || true" was always
  // true); interrupt with Ctrl-C
  while (true) {
    launchKernel(cells, GENERATION_STEP, GRID_X_LOG2, GRID_Y_LOG2);
    drawCells(cells, generation, SLEEP_TIME, GRID_X_LOG2, GRID_Y_LOG2);
    generation += GENERATION_STEP;
  }
  // Unreachable today, but correct if the loop ever gains an exit condition
  cudaFree(cells);
}