Hw3 #424

Open · wants to merge 6 commits into master

Changes from all commits
17 changes: 15 additions & 2 deletions HW3/P2/mandelbrot.cl
@@ -9,11 +9,24 @@ mandelbrot(__global __read_only float *coords_real,
const int y = get_global_id(1);

float c_real, c_imag;
float z_real, z_imag;
float z_real, z_imag, z_real_temp;
int iter;

if ((x < w) && (y < h)) {
// YOUR CODE HERE
;
iter=0;
z_real=coords_real[y * w + x];
z_imag=coords_imag[y * w + x];
c_real=coords_real[y * w + x];
c_imag=coords_imag[y * w + x];
//printf("%f",z_real);
while (((z_real*z_real+z_imag*z_imag)<4) && (iter<max_iter)) {
z_real_temp=z_real*z_real-z_imag*z_imag+c_real;
z_imag=2*z_real*z_imag+c_imag;
z_real=z_real_temp;
iter=iter+1;
}
out_counts[y * w + x]=iter;
//printf("%i",iter);
}
}
10 changes: 6 additions & 4 deletions HW3/P2/mandelbrot.py
@@ -2,7 +2,9 @@
import pyopencl as cl
import numpy as np
import pylab

import os
import pdb
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
def round_up(global_size, group_size):
r = global_size % group_size
if r == 0:
@@ -45,7 +47,7 @@ def make_coords(center=(-0.575 - 0.575j),

# Create a queue for transferring data and launching computations.
# Turn on profiling to allow us to check event times.
queue = cl.CommandQueue(context, context.devices[0],
queue = cl.CommandQueue(context, context.devices[1],
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

@@ -55,16 +57,17 @@ def make_coords(center=(-0.575 - 0.575j),
real_coords = np.real(in_coords).copy()
imag_coords = np.imag(in_coords).copy()


gpu_real = cl.Buffer(context, cl.mem_flags.READ_ONLY, real_coords.size * 4)
gpu_imag = cl.Buffer(context, cl.mem_flags.READ_ONLY, real_coords.size * 4)
gpu_counts = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, real_coords.size * 4)

local_size = (8, 8) # 64 pixels per work group
global_size = tuple([round_up(g, l) for g, l in zip(in_coords.shape[::-1], local_size)])
pdb.set_trace()
width = np.int32(in_coords.shape[1])
height = np.int32(in_coords.shape[0])
max_iters = np.int32(1024)

cl.enqueue_copy(queue, gpu_real, real_coords, is_blocking=False)
cl.enqueue_copy(queue, gpu_imag, imag_coords, is_blocking=False)

@@ -73,7 +76,6 @@ def make_coords(center=(-0.575 - 0.575j),
width, height, max_iters)

cl.enqueue_copy(queue, out_counts, gpu_counts, is_blocking=True)

seconds = (event.profile.end - event.profile.start) / 1e9
print("{} Million Complex FMAs in {} seconds, {} million Complex FMAs / second".format(out_counts.sum() / 1e6, seconds, (out_counts.sum() / seconds) / 1e6))
pylab.imshow(np.log(out_counts))
5 changes: 5 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,5 @@
On my machine, any configuration with more than 32 workers resulted in an error. It also seems that when the global size exceeds 4096, the computation does not complete correctly in the blocked-reads case (I see an assertion error).

With that in mind, the best performance was achieved with the greatest number of workgroups and workers:
coalesced reads, workgroups: 512, num_workers: 32, 0.013397824 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.03992816 seconds
52 changes: 39 additions & 13 deletions HW3/P3/sum.cl
@@ -5,13 +5,17 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int global_id=get_global_id(0);
//float local_size=get_local_size(0);
//int lim=(int)log2(local_size);
int i,j,offset;

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (i=global_id;i<N;i=i+get_global_size(0)) { // YOUR CODE HERE
sum=sum+x[i];
}

//printf("thread[%d], sum=%f \n",global_id,sum);
fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

@@ -24,8 +28,15 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
j=1;
offset=get_local_size(0)>>j;
while(offset>1) {
//for (j = 1; j <= lim; j++) { // YOUR CODE HERE
offset=get_local_size(0)>>j;
//printf("global_id[%d], offset=%d\n",global_id,offset);
fast[local_id]=fast[local_id]+fast[local_id+offset];

[Review comment]

If you compare this to the code in the link above, you'll see that an if statement is missing.
This causes 2 issues:

  1. Some threads may access data that is beyond the end of the input array.
  2. The output may differ because in some iteration, given two threads A and B (where B_id = A_id + offset), B may update its value, and then A will read the new value.
    This may also explain why you saw an error when using more than 32 threads, or when the global size was larger than 4096.
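
[Suggested fix] Following the linked reduce example, a guarded version of the binary reduction might look like this — a sketch, reusing the variable names from the code above and assuming get_local_size(0) is a power of 2:

    // halve the active range each step; only the lower half accumulates,
    // so no work-item reads past the live data or races a concurrent update
    for (uint offset = get_local_size(0) / 2; offset > 0; offset >>= 1) {
        if (local_id < offset) {
            fast[local_id] += fast[local_id + offset];
        }
        barrier(CLK_LOCAL_MEM_FENCE);  // reached by every work-item each pass
    }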

barrier(CLK_LOCAL_MEM_FENCE);
j++;
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +49,12 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil(float(N) / get_global_size(0));
int global_id = get_global_id(0);
//float local_size=get_local_size(0);
int k = ceil((float)(N) / get_global_size(0));
//int lim=(int)log2(local_size);
int i,j,offset;


// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
@@ -48,13 +64,14 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (i=global_id*k; i <= (k*(global_id+1)-1); i++) { // YOUR CODE HERE
sum=sum+x[i];

[Review comment]

You should verify that you access only valid elements from x (i.e., that i < N).
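
[Suggested fix] A bounds-checked version of the blocked loop might read as follows (a sketch, reusing the variables declared above):

    // stop at the end of this thread's block or the end of x, whichever is first
    for (i = global_id * k; i < k * (global_id + 1) && i < N; i++) {
        sum = sum + x[i];
    }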

}



fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

//printf("local size=%f, lim=%d \n",local_size,lim);
// binary reduction
//
// thread i should sum fast[i] and fast[i + offset] and store back
@@ -64,9 +81,18 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
j=1;
offset=get_local_size(0)>>j;
while(offset>1) {
//for (j = 1; j <= lim; j++) { // YOUR CODE HERE
offset=get_local_size(0)>>j;
//printf("global_id[%d], offset=%d\n",global_id,offset);
fast[local_id]=fast[local_id]+fast[local_id+offset];
barrier(CLK_LOCAL_MEM_FENCE);
j++;
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
if (local_id == 0) {
partial[get_group_id(0)] = fast[0];
}
}
15 changes: 11 additions & 4 deletions HW3/P3/tune.py
@@ -1,6 +1,8 @@
import pyopencl as cl
import numpy as np

import pdb
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
def create_data(N):
return host_x, x

@@ -13,7 +15,7 @@ def create_data(N):
print("#{0}: {1} on {2}".format(i, d.name, d.platform.name))
ctx = cl.Context(devices)

queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
queue = cl.CommandQueue(ctx, ctx.devices[1], properties=cl.command_queue_properties.PROFILING_ENABLE)

program = cl.Program(ctx, open('sum.cl').read()).build(options='')

@@ -25,15 +27,17 @@ def create_data(N):
for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
for num_workers in 2 ** np.arange(2, 6):
local = cl.LocalMemory(num_workers * 4)
#pdb.set_trace()
event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,),
x, partial_sums, local, np.uint64(N))
cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)

sum_gpu = sum(host_partial)
sum_host = sum(host_x)
seconds = (event.profile.end - event.profile.start) / 1e9
#pdb.set_trace()
assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
times['coalesced', num_workgroups, num_workers] = seconds
print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds".
@@ -42,15 +46,18 @@ def create_data(N):
for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
for num_workers in 2 ** np.arange(2, 6):
print("global_size={}".format(num_workers*num_workgroups))
local = cl.LocalMemory(num_workers * 4)
event = program.sum_blocked(queue, (num_workgroups * num_workers,), (num_workers,),
x, partial_sums, local, np.uint64(N))
#pdb.set_trace()
cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)

sum_gpu = sum(host_partial)
sum_host = sum(host_x)
seconds = (event.profile.end - event.profile.start) / 1e9

assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
times['blocked', num_workgroups, num_workers] = seconds
print("blocked reads, workgroups: {}, num_workers: {}, {} seconds".
126 changes: 121 additions & 5 deletions HW3/P4/median_filter.cl
@@ -1,4 +1,76 @@
#include "median9.h"
//#include "median9.h"


#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) < (b)) ? (b) : (a))

#define cas(a, b) tmp = min(a, b); b = max(a, b); a = tmp

inline float median9(float s0, float s1, float s2,
float s3, float s4, float s5,
float s6, float s7, float s8)
{
// http://a-hackers-craic.blogspot.com/2011/05/3x3-median-filter-or-branchless.html
float tmp;

cas(s1, s2);
cas(s4, s5);
cas(s7, s8);

cas(s0, s1);
cas(s3, s4);
cas(s6, s7);

cas(s1, s2);
cas(s4, s5);
cas(s7, s8);

cas(s3, s6);
cas(s4, s7);
cas(s5, s8);
cas(s0, s3);

cas(s1, s4);
cas(s2, s5);
cas(s3, s6);

cas(s4, s7);
cas(s1, s3);

cas(s2, s6);

cas(s2, s3);
cas(s4, s6);

cas(s3, s4);

return s4;
}

// Return the image value at point (x,y), where the indices are relative to the global image. If the indices are out of bounds, use the closest in-bounds value instead.
inline float fetch_point(__global __read_only float *img,
int w, int h,
int x, int y)
{
float out_img;
while (x<0) {

[Review comment]

It is unclear why you use a while loop if what you are aiming for is setting the value of 'x' to 0 when it is less than 0.
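
[Suggested fix] The four loops collapse to OpenCL's built-in integer clamp (a sketch):

    // clamp the coordinates into the valid image range in one step
    x = clamp(x, 0, w - 1);
    y = clamp(y, 0, h - 1);
    out_img = img[w * y + x];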

x++;
}
while (x>w-1){
x--;
}
while (y<0) {
y++;
}
while (y>h-1){
y--;
}
out_img=img[w*y+x];

return out_img;

}


// 3x3 median filter
__kernel void
@@ -13,22 +85,66 @@ median_3x3(__global __read_only float *in_values,
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.

//store calculated median here before transferring
float out_buffer;

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

const int idx_1D = ly * get_local_size(0) + lx;

int row;

if (x < w && y < h){

[Review comment]

Adding this line here may cause a partial population of the buffer (for example, without the column that is to the right of the right-most column), but then when reading from the buffer (lines 135-143), you may read an invalid value.

// Load into buffer (with 1-pixel halo).
//
// It may be helpful to consult HW3 Problem 5, and
// https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
//
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.
if (idx_1D < buf_w) {
for (row = 0; row < buf_h; row++) {
buffer[row * buf_w + idx_1D] = \
fetch_point(in_values, w, h,
buf_corner_x + idx_1D,
buf_corner_y + row);
}
}

barrier(CLK_LOCAL_MEM_FENCE);

[Review comment]

A barrier should be performed by all threads (not only the ones that enter the if statement).
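
[Suggested fix] Combining this with the comment above about partial buffer population, one common structure — a sketch, following the load_halo.cl pattern referenced in the assignment comments and reusing the variables declared in this kernel — is to let every work-item take part in the load and the barrier, and to guard only the final computation and write:

    // 1) every work-item helps populate the local buffer (no x/y guard here)
    if (idx_1D < buf_w) {
        for (row = 0; row < buf_h; row++) {
            buffer[row * buf_w + idx_1D] =
                fetch_point(in_values, w, h,
                            buf_corner_x + idx_1D,
                            buf_corner_y + row);
        }
    }

    // 2) every work-item reaches the barrier
    barrier(CLK_LOCAL_MEM_FENCE);

    // 3) only valid output pixels compute and write back the median
    if (x < w && y < h) {
        int bc_x = lx + halo;
        int bc_y = ly + halo;
        out_values[y * w + x] =
            median9(buffer[(bc_y - 1) * buf_w + bc_x - 1],
                    buffer[(bc_y - 1) * buf_w + bc_x],
                    buffer[(bc_y - 1) * buf_w + bc_x + 1],
                    buffer[bc_y * buf_w + bc_x - 1],
                    buffer[bc_y * buf_w + bc_x],
                    buffer[bc_y * buf_w + bc_x + 1],
                    buffer[(bc_y + 1) * buf_w + bc_x - 1],
                    buffer[(bc_y + 1) * buf_w + bc_x],
                    buffer[(bc_y + 1) * buf_w + bc_x + 1]);
    }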


// Compute 3x3 median for each pixel in core (non-halo) pixels
//
// We've given you median9.h, and included it above, so you can
// use the median9() function.

// Core point is buffer[(ly+halo)*buf_w+(lx+halo)]. For example, (lx,ly)=(0,0) maps to buffer coordinate (1,1). In the variables below, bc stands for "buffer core."
int bc_x=lx+halo;
int bc_y=ly+halo;
out_buffer = median9(buffer[(bc_y-1)*buf_w+bc_x-1],
buffer[(bc_y-1)*buf_w+bc_x],
buffer[(bc_y-1)*buf_w+bc_x+1],
buffer[(bc_y)*buf_w+bc_x-1],
buffer[(bc_y)*buf_w+bc_x],
buffer[(bc_y)*buf_w+bc_x+1],
buffer[(bc_y+1)*buf_w+bc_x-1],
buffer[(bc_y+1)*buf_w+bc_x],
buffer[(bc_y+1)*buf_w+bc_x+1]);


out_values[y*w+x]=out_buffer;
}
// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.
}