-
Notifications
You must be signed in to change notification settings - Fork 96
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Hw3 #424
base: master
Are you sure you want to change the base?
Hw3 #424
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
For my machine, any configuration with >32 workers resulted in an error. It also seems that when the global size > 4096, the computation does not complete correctly in the blocked reads case (I see an assertion error). | ||
|
||
With that in mind, the best performance was achieved with the greatest number of workgroups and workers | ||
coalesced reads, workgroups: 512, num_workers: 32, 0.013397824 seconds | ||
blocked reads, workgroups: 128, num_workers: 16, 0.03992816 seconds |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,13 +5,17 @@ __kernel void sum_coalesced(__global float* x, | |
{ | ||
float sum = 0; | ||
size_t local_id = get_local_id(0); | ||
int global_id=get_global_id(0); | ||
//float local_size=get_local_size(0); | ||
//int lim=(int)log2(local_size); | ||
int i,j,offset; | ||
|
||
// thread i (i.e., with i = get_global_id()) should add x[i], | ||
// x[i + get_global_size()], ... up to N-1, and store in sum. | ||
for (;;) { // YOUR CODE HERE | ||
; // YOUR CODE HERE | ||
for (i=global_id;i<N;i=i+get_global_size(0)) { // YOUR CODE HERE | ||
sum=sum+x[i]; | ||
} | ||
|
||
//printf("thread[%d], sum=%f \n",global_id,sum); | ||
fast[local_id] = sum; | ||
barrier(CLK_LOCAL_MEM_FENCE); | ||
|
||
|
@@ -24,8 +28,15 @@ __kernel void sum_coalesced(__global float* x, | |
// You can assume get_local_size(0) is a power of 2. | ||
// | ||
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ | ||
for (;;) { // YOUR CODE HERE | ||
; // YOUR CODE HERE | ||
j=1; | ||
offset=get_local_size(0)>>j; | ||
while(offset>1) { | ||
//for (j = 1; j <= lim; j++) { // YOUR CODE HERE | ||
offset=get_local_size(0)>>j; | ||
//printf("global_id[%d], offset=%d\n",global_id,offset); | ||
fast[local_id]=fast[local_id]+fast[local_id+offset]; | ||
barrier(CLK_LOCAL_MEM_FENCE); | ||
j++; | ||
} | ||
|
||
if (local_id == 0) partial[get_group_id(0)] = fast[0]; | ||
|
@@ -38,7 +49,12 @@ __kernel void sum_blocked(__global float* x, | |
{ | ||
float sum = 0; | ||
size_t local_id = get_local_id(0); | ||
int k = ceil(float(N) / get_global_size(0)); | ||
int global_id = get_global_id(0); | ||
//float local_size=get_local_size(0); | ||
int k = ceil((float)(N) / get_global_size(0)); | ||
//int lim=(int)log2(local_size); | ||
int i,j,offset; | ||
|
||
|
||
// thread with global_id 0 should add 0..k-1 | ||
// thread with global_id 1 should add k..2k-1 | ||
|
@@ -48,13 +64,14 @@ __kernel void sum_blocked(__global float* x, | |
// | ||
// Be careful that each thread stays in bounds, both relative to | ||
// size of x (i.e., N), and the range it's assigned to sum. | ||
for (;;) { // YOUR CODE HERE | ||
; // YOUR CODE HERE | ||
for (i=global_id*k; i <= (k*(global_id+1)-1); i++) { // YOUR CODE HERE | ||
sum=sum+x[i]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should verify that you access only valid elements from x (i.e., that i < N). |
||
} | ||
|
||
|
||
|
||
fast[local_id] = sum; | ||
barrier(CLK_LOCAL_MEM_FENCE); | ||
|
||
//printf("local size=%f, lim=%d \n",local_size,lim); | ||
// binary reduction | ||
// | ||
// thread i should sum fast[i] and fast[i + offset] and store back | ||
|
@@ -64,9 +81,18 @@ __kernel void sum_blocked(__global float* x, | |
// You can assume get_local_size(0) is a power of 2. | ||
// | ||
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ | ||
for (;;) { // YOUR CODE HERE | ||
; // YOUR CODE HERE | ||
j=1; | ||
offset=get_local_size(0)>>j; | ||
while(offset>1) { | ||
//for (j = 1; j <= lim; j++) { // YOUR CODE HERE | ||
offset=get_local_size(0)>>j; | ||
//printf("global_id[%d], offset=%d\n",global_id,offset); | ||
fast[local_id]=fast[local_id]+fast[local_id+offset]; | ||
barrier(CLK_LOCAL_MEM_FENCE); | ||
j++; | ||
} | ||
|
||
if (local_id == 0) partial[get_group_id(0)] = fast[0]; | ||
if (local_id == 0) { | ||
partial[get_group_id(0)] = fast[0]; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,76 @@ | ||
#include "median9.h" | ||
//#include "median9.h" | ||
|
||
|
||
// Ternary min/max helpers. An argument can be evaluated twice, so pass only side-effect-free expressions. | ||
#define min(a, b) (((a) < (b)) ? (a) : (b)) | ||
#define max(a, b) (((a) < (b)) ? (b) : (a)) | ||
|
||
// Compare-and-swap: after cas(a, b), a holds the smaller value and b the larger. | ||
// Expands to three statements (no trailing semicolon) and needs a variable named tmp in the enclosing scope, | ||
// so it must not be used as the unbraced body of an if/for/while. | ||
#define cas(a, b) tmp = min(a, b); b = max(a, b); a = tmp | ||
|
||
// median9: runs a fixed sequence of cas() compare-and-swap steps over the nine inputs and returns s4. | ||
// Per the article linked in the body this is a 3x3 median network, so s4 is presumably the median of s0..s8 (TODO confirm against the article). | ||
// The result depends on the exact order of the steps; do not reorder them. | ||
inline float median9(float s0, float s1, float s2, | ||
float s3, float s4, float s5, | ||
float s6, float s7, float s8) | ||
{ | ||
// Source of the network: | ||
// http://a-hackers-craic.blogspot.com/2011/05/3x3-median-filter-or-branchless.html | ||
// Scratch variable required by the cas() macro. | ||
float tmp; | ||
|
||
// The next three groups put each triple (s0,s1,s2), (s3,s4,s5), (s6,s7,s8) into ascending order. | ||
cas(s1, s2); | ||
cas(s4, s5); | ||
cas(s7, s8); | ||
|
||
cas(s0, s1); | ||
cas(s3, s4); | ||
cas(s6, s7); | ||
|
||
cas(s1, s2); | ||
cas(s4, s5); | ||
cas(s7, s8); | ||
|
||
// Remaining steps compare values across the triples; see the linked article for the derivation. | ||
cas(s3, s6); | ||
cas(s4, s7); | ||
cas(s5, s8); | ||
cas(s0, s3); | ||
|
||
cas(s1, s4); | ||
cas(s2, s5); | ||
cas(s3, s6); | ||
|
||
cas(s4, s7); | ||
cas(s1, s3); | ||
|
||
cas(s2, s6); | ||
|
||
cas(s2, s3); | ||
cas(s4, s6); | ||
|
||
cas(s3, s4); | ||
|
||
// s4 is the value the network leaves in the middle slot. | ||
return s4; | ||
} | ||
|
||
// Return the image value at point (x, y), where the indices are relative to the global image. | ||
// If an index is out of bounds, the closest in-bounds value is used instead. | ||
// img is indexed as img[w*y + x]; w and h are the bounds used for x and y respectively. | ||
inline float fetch_point(__global __read_only float *img, | ||
int w, int h, | ||
int x, int y) | ||
{ | ||
float out_img; | ||
// Step x one at a time until 0 <= x <= w-1. For w >= 1 the end result equals clamping x to [0, w-1], | ||
// at the cost of one iteration per column of overshoot. | ||
while (x<0) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is unclear why you use a while loop, if what you are aiming for is setting the value of 'x' to 0 if it is less than 0. |
||
x++; | ||
} | ||
while (x>w-1){ | ||
x--; | ||
} | ||
// Same stepping for y, bounded by [0, h-1]. | ||
while (y<0) { | ||
y++; | ||
} | ||
while (y>h-1){ | ||
y--; | ||
} | ||
// Read row y, column x of the image. | ||
out_img=img[w*y+x]; | ||
|
||
return out_img; | ||
|
||
} | ||
|
||
|
||
// 3x3 median filter | ||
__kernel void | ||
|
@@ -13,22 +85,66 @@ median_3x3(__global __read_only float *in_values, | |
// without using the local buffer, first, then adjust your code to | ||
// use such a buffer after you have that working. | ||
|
||
//store calculated median here before transferring | ||
float out_buffer; | ||
|
||
// Global position of output pixel | ||
const int x = get_global_id(0); | ||
const int y = get_global_id(1); | ||
|
||
// Local position relative to (0, 0) in workgroup | ||
const int lx = get_local_id(0); | ||
const int ly = get_local_id(1); | ||
|
||
// coordinates of the upper left corner of the buffer in image | ||
// space, including halo | ||
const int buf_corner_x = x - lx - halo; | ||
const int buf_corner_y = y - ly - halo; | ||
|
||
const int idx_1D = ly * get_local_size(0) + lx; | ||
|
||
int row; | ||
|
||
if (x < w && y < h){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Adding this line here may cause a partial population of the buffer (for example, without the column that is to the right of the right-most column), but then when reading from the buffer (lines 135-143), you may read an invalid value. |
||
// Load into buffer (with 1-pixel halo). | ||
// | ||
// It may be helpful to consult HW3 Problem 5, and | ||
// https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl | ||
// | ||
// Note that globally out-of-bounds pixels should be replaced | ||
// with the nearest valid pixel's value. | ||
if (idx_1D < buf_w) { | ||
for (row = 0; row < buf_h; row++) { | ||
buffer[row * buf_w + idx_1D] = \ | ||
fetch_point(in_values, w, h, | ||
buf_corner_x + idx_1D, | ||
buf_corner_y + row); | ||
} | ||
} | ||
|
||
barrier(CLK_LOCAL_MEM_FENCE); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A barrier should be performed by all threads (not only the ones that enter the if statement). |
||
|
||
// Compute 3x3 median for each pixel in core (non-halo) pixels | ||
// | ||
// We've given you median9.h, and included it above, so you can | ||
// use the median9() function. | ||
// Compute 3x3 median for each pixel in core (non-halo) pixels | ||
// | ||
// We've given you median9.h, and included it above, so you can | ||
// use the median9() function. | ||
|
||
//core point is buffer[(ly+halo)*buf_w+(lx+halo)]. For example, for (lx,ly)=(0,0), their value in the buffer should be (1,1). In the variables below, bc stands for "buffer core." | ||
int bc_x=lx+halo; | ||
int bc_y=ly+halo; | ||
out_buffer = median9(buffer[(bc_y-1)*buf_w+bc_x-1], | ||
buffer[(bc_y-1)*buf_w+bc_x], | ||
buffer[(bc_y-1)*buf_w+bc_x+1], | ||
buffer[(bc_y)*buf_w+bc_x-1], | ||
buffer[(bc_y)*buf_w+bc_x], | ||
buffer[(bc_y)*buf_w+bc_x+1], | ||
buffer[(bc_y+1)*buf_w+bc_x-1], | ||
buffer[(bc_y+1)*buf_w+bc_x], | ||
buffer[(bc_y+1)*buf_w+bc_x+1]); | ||
|
||
|
||
out_values[y*w+x]=out_buffer; | ||
} | ||
// Each thread in the valid region (x < w, y < h) should write | ||
// back its 3x3 neighborhood median. | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you compare this to the code in the link above, you'll see that an if statement is missing.
This causes 2 issues:
This may also explain why you saw an error when using more than 32 threads, or when the global size was larger than 4096.