Hw3 #424

Open · wants to merge 6 commits into master

Changes from all commits
17 changes: 15 additions & 2 deletions HW3/P2/mandelbrot.cl
@@ -9,11 +9,24 @@ mandelbrot(__global __read_only float *coords_real,
const int y = get_global_id(1);

float c_real, c_imag;
float z_real, z_imag;
float z_real, z_imag, z_real_temp;
int iter;

if ((x < w) && (y < h)) {
// YOUR CODE HERE
;
iter=0;
z_real=coords_real[y * w + x];
z_imag=coords_imag[y * w + x];
c_real=coords_real[y * w + x];
c_imag=coords_imag[y * w + x];
//printf("%f",z_real);
while (((z_real*z_real+z_imag*z_imag)<4) && (iter<max_iter)) {
z_real_temp=z_real*z_real-z_imag*z_imag+c_real;
z_imag=2*z_real*z_imag+c_imag;
z_real=z_real_temp;
iter=iter+1;
}
out_counts[y * w + x]=iter;
//printf("%i",iter);
}
}
10 changes: 6 additions & 4 deletions HW3/P2/mandelbrot.py
@@ -2,7 +2,9 @@
import pyopencl as cl
import numpy as np
import pylab

import os
import pdb
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
def round_up(global_size, group_size):
r = global_size % group_size
if r == 0:
@@ -45,7 +47,7 @@ def make_coords(center=(-0.575 - 0.575j),

# Create a queue for transferring data and launching computations.
# Turn on profiling to allow us to check event times.
queue = cl.CommandQueue(context, context.devices[0],
queue = cl.CommandQueue(context, context.devices[1],
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

@@ -55,16 +57,17 @@ def make_coords(center=(-0.575 - 0.575j),
real_coords = np.real(in_coords).copy()
imag_coords = np.imag(in_coords).copy()


gpu_real = cl.Buffer(context, cl.mem_flags.READ_ONLY, real_coords.size * 4)
gpu_imag = cl.Buffer(context, cl.mem_flags.READ_ONLY, real_coords.size * 4)
gpu_counts = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, real_coords.size * 4)

local_size = (8, 8) # 64 pixels per work group
global_size = tuple([round_up(g, l) for g, l in zip(in_coords.shape[::-1], local_size)])
pdb.set_trace()
width = np.int32(in_coords.shape[1])
height = np.int32(in_coords.shape[0])
max_iters = np.int32(1024)

cl.enqueue_copy(queue, gpu_real, real_coords, is_blocking=False)
cl.enqueue_copy(queue, gpu_imag, imag_coords, is_blocking=False)

@@ -73,7 +76,6 @@ def make_coords(center=(-0.575 - 0.575j),
width, height, max_iters)

cl.enqueue_copy(queue, out_counts, gpu_counts, is_blocking=True)

seconds = (event.profile.end - event.profile.start) / 1e9
print("{} Million Complex FMAs in {} seconds, {} million Complex FMAs / second".format(out_counts.sum() / 1e6, seconds, (out_counts.sum() / seconds) / 1e6))
pylab.imshow(np.log(out_counts))
5 changes: 5 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,5 @@
On my machine, any configuration with more than 32 workers resulted in an error. It also seems that when the global size exceeds 4096, the computation does not complete correctly in the blocked-reads case (I see an assertion error).

With that in mind, the best performance was achieved with the greatest number of workgroups and workers:
coalesced reads, workgroups: 512, num_workers: 32, 0.013397824 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.03992816 seconds
52 changes: 39 additions & 13 deletions HW3/P3/sum.cl
@@ -5,13 +5,17 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int global_id=get_global_id(0);
//float local_size=get_local_size(0);
//int lim=(int)log2(local_size);
int i,j,offset;

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (i=global_id;i<N;i=i+get_global_size(0)) { // YOUR CODE HERE
sum=sum+x[i];
}

//printf("thread[%d], sum=%f \n",global_id,sum);
fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

@@ -24,8 +28,15 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
j=1;
offset=get_local_size(0)>>j;
while(offset>1) {
//for (j = 1; j <= lim; j++) { // YOUR CODE HERE
offset=get_local_size(0)>>j;
//printf("global_id[%d], offset=%d\n",global_id,offset);
fast[local_id]=fast[local_id]+fast[local_id+offset];

[Review comment]

If you compare this to the code in the link above, you'll see that an if statement is missing.
This causes 2 issues:

  1. Some threads may access data that is beyond the end of the input array.
  2. The output may differ because in some iteration, given two threads A and B (where B_id = A_id + offset), B may update its value, and then A will read the new value.
    This may also explain why you saw an error when using more than 32 threads, or when the global size was larger than 4096.
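
[Suggested fix] Following the linked reduce example, a guarded version of the binary reduction might look like this — a sketch, reusing the variable names from the code above and assuming get_local_size(0) is a power of 2:

    // halve the active range each step; only the lower half accumulates,
    // so no work-item reads past the live data or races a concurrent update
    for (uint offset = get_local_size(0) / 2; offset > 0; offset >>= 1) {
        if (local_id < offset) {
            fast[local_id] += fast[local_id + offset];
        }
        barrier(CLK_LOCAL_MEM_FENCE);  // reached by every work-item each pass
    }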

barrier(CLK_LOCAL_MEM_FENCE);
j++;
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +49,12 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil(float(N) / get_global_size(0));
int global_id = get_global_id(0);
//float local_size=get_local_size(0);
int k = ceil((float)(N) / get_global_size(0));
//int lim=(int)log2(local_size);
int i,j,offset;


// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
@@ -48,13 +64,14 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (i=global_id*k; i <= (k*(global_id+1)-1); i++) { // YOUR CODE HERE
sum=sum+x[i];

[Review comment]

You should verify that you access only valid elements from x (i.e., that i < N).
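
[Suggested fix] A bounds-checked version of the blocked loop might read as follows (a sketch, reusing the variables declared above):

    // stop at the end of this thread's block or the end of x, whichever is first
    for (i = global_id * k; i < k * (global_id + 1) && i < N; i++) {
        sum = sum + x[i];
    }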

}



fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

//printf("local size=%f, lim=%d \n",local_size,lim);
// binary reduction
//
// thread i should sum fast[i] and fast[i + offset] and store back
@@ -64,9 +81,18 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
j=1;
offset=get_local_size(0)>>j;
while(offset>1) {
//for (j = 1; j <= lim; j++) { // YOUR CODE HERE
offset=get_local_size(0)>>j;
//printf("global_id[%d], offset=%d\n",global_id,offset);
fast[local_id]=fast[local_id]+fast[local_id+offset];
barrier(CLK_LOCAL_MEM_FENCE);
j++;
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
if (local_id == 0) {
partial[get_group_id(0)] = fast[0];
}
}
15 changes: 11 additions & 4 deletions HW3/P3/tune.py
@@ -1,6 +1,8 @@
import pyopencl as cl
import numpy as np

import pdb
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
def create_data(N):
return host_x, x

@@ -13,7 +15,7 @@ def create_data(N):
print("#{0}: {1} on {2}".format(i, d.name, d.platform.name))
ctx = cl.Context(devices)

queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
queue = cl.CommandQueue(ctx, ctx.devices[1], properties=cl.command_queue_properties.PROFILING_ENABLE)

program = cl.Program(ctx, open('sum.cl').read()).build(options='')

@@ -25,15 +27,17 @@ def create_data(N):
for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
for num_workers in 2 ** np.arange(2, 6):
local = cl.LocalMemory(num_workers * 4)
#pdb.set_trace()
event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,),
x, partial_sums, local, np.uint64(N))
cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)

sum_gpu = sum(host_partial)
sum_host = sum(host_x)
seconds = (event.profile.end - event.profile.start) / 1e9
#pdb.set_trace()
assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
times['coalesced', num_workgroups, num_workers] = seconds
print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds".
@@ -42,15 +46,18 @@ def create_data(N):
for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
for num_workers in 2 ** np.arange(2, 6):
print("global_size={}".format(num_workers*num_workgroups))
local = cl.LocalMemory(num_workers * 4)
event = program.sum_blocked(queue, (num_workgroups * num_workers,), (num_workers,),
x, partial_sums, local, np.uint64(N))
#pdb.set_trace()
cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)

sum_gpu = sum(host_partial)
sum_host = sum(host_x)
seconds = (event.profile.end - event.profile.start) / 1e9

assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
times['blocked', num_workgroups, num_workers] = seconds
print("blocked reads, workgroups: {}, num_workers: {}, {} seconds".
126 changes: 121 additions & 5 deletions HW3/P4/median_filter.cl
@@ -1,4 +1,76 @@
#include "median9.h"
//#include "median9.h"


#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) < (b)) ? (b) : (a))

#define cas(a, b) tmp = min(a, b); b = max(a, b); a = tmp

inline float median9(float s0, float s1, float s2,
float s3, float s4, float s5,
float s6, float s7, float s8)
{
// http://a-hackers-craic.blogspot.com/2011/05/3x3-median-filter-or-branchless.html
float tmp;

cas(s1, s2);
cas(s4, s5);
cas(s7, s8);

cas(s0, s1);
cas(s3, s4);
cas(s6, s7);

cas(s1, s2);
cas(s4, s5);
cas(s7, s8);

cas(s3, s6);
cas(s4, s7);
cas(s5, s8);
cas(s0, s3);

cas(s1, s4);
cas(s2, s5);
cas(s3, s6);

cas(s4, s7);
cas(s1, s3);

cas(s2, s6);

cas(s2, s3);
cas(s4, s6);

cas(s3, s4);

return s4;
}

// Return the image value at point (x,y), where the indices are relative to the global image. If the indices are out of bounds, use the closest in-bounds value instead.
inline float fetch_point(__global __read_only float *img,
int w, int h,
int x, int y)
{
float out_img;
while (x<0) {

[Review comment]

It is unclear why you use a while loop if what you are aiming for is setting the value of 'x' to 0 when it is less than 0.
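
[Suggested fix] The four loops collapse to OpenCL's built-in integer clamp (a sketch):

    // clamp the coordinates into the valid image range in one step
    x = clamp(x, 0, w - 1);
    y = clamp(y, 0, h - 1);
    out_img = img[w * y + x];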

x++;
}
while (x>w-1){
x--;
}
while (y<0) {
y++;
}
while (y>h-1){
y--;
}
out_img=img[w*y+x];

return out_img;

}


// 3x3 median filter
__kernel void
@@ -13,22 +85,66 @@ median_3x3(__global __read_only float *in_values,
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.

//store calculated median here before transferring
float out_buffer;

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

const int idx_1D = ly * get_local_size(0) + lx;

int row;

if (x < w && y < h){

[Review comment]

Adding this line here may cause a partial population of the buffer (for example, without the column that is to the right of the right-most column), but then when reading from the buffer (lines 135-143), you may read an invalid value.

// Load into buffer (with 1-pixel halo).
//
// It may be helpful to consult HW3 Problem 5, and
// https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
//
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.
if (idx_1D < buf_w) {
for (row = 0; row < buf_h; row++) {
buffer[row * buf_w + idx_1D] = \
fetch_point(in_values, w, h,
buf_corner_x + idx_1D,
buf_corner_y + row);
}
}

barrier(CLK_LOCAL_MEM_FENCE);

[Review comment]

A barrier should be performed by all threads (not only the ones that enter the if statement).
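
[Suggested fix] Combining this with the comment above about partial buffer population, one common structure — a sketch, following the load_halo.cl pattern referenced in the assignment comments and reusing the variables declared in this kernel — is to let every work-item take part in the load and the barrier, and to guard only the final computation and write:

    // 1) every work-item helps populate the local buffer (no x/y guard here)
    if (idx_1D < buf_w) {
        for (row = 0; row < buf_h; row++) {
            buffer[row * buf_w + idx_1D] =
                fetch_point(in_values, w, h,
                            buf_corner_x + idx_1D,
                            buf_corner_y + row);
        }
    }

    // 2) every work-item reaches the barrier
    barrier(CLK_LOCAL_MEM_FENCE);

    // 3) only valid output pixels compute and write back the median
    if (x < w && y < h) {
        int bc_x = lx + halo;
        int bc_y = ly + halo;
        out_values[y * w + x] =
            median9(buffer[(bc_y - 1) * buf_w + bc_x - 1],
                    buffer[(bc_y - 1) * buf_w + bc_x],
                    buffer[(bc_y - 1) * buf_w + bc_x + 1],
                    buffer[bc_y * buf_w + bc_x - 1],
                    buffer[bc_y * buf_w + bc_x],
                    buffer[bc_y * buf_w + bc_x + 1],
                    buffer[(bc_y + 1) * buf_w + bc_x - 1],
                    buffer[(bc_y + 1) * buf_w + bc_x],
                    buffer[(bc_y + 1) * buf_w + bc_x + 1]);
    }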


// Compute 3x3 median for each pixel in core (non-halo) pixels
//
// We've given you median9.h, and included it above, so you can
// use the median9() function.

// Core point is buffer[(ly+halo)*buf_w+(lx+halo)]. For example, (lx,ly)=(0,0) maps to buffer coordinate (1,1). In the variables below, bc stands for "buffer core."
int bc_x=lx+halo;
int bc_y=ly+halo;
out_buffer = median9(buffer[(bc_y-1)*buf_w+bc_x-1],
buffer[(bc_y-1)*buf_w+bc_x],
buffer[(bc_y-1)*buf_w+bc_x+1],
buffer[(bc_y)*buf_w+bc_x-1],
buffer[(bc_y)*buf_w+bc_x],
buffer[(bc_y)*buf_w+bc_x+1],
buffer[(bc_y+1)*buf_w+bc_x-1],
buffer[(bc_y+1)*buf_w+bc_x],
buffer[(bc_y+1)*buf_w+bc_x+1]);


out_values[y*w+x]=out_buffer;
}
// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
}