Problem Statement
Can PyOpenCL be used to improve the performance of the ellipse fit problem.
Discussion
The objection function can be recoded using PyOpenCL. This module simplifies experimentation with global and local memory and it simplifies selecting the calculation precision.
To test this get the other code snippets from here. Install all of the modules in a directory, then run this module.
objective_pyopencl.py
''' This module contains an objective function for the ellipse fitting problem. The objective is coded using vectorized operations. ''' __author__ = 'Ed Tate' __email__ = 'edtate<at>gmail-dot-com' __website__ = 'exnumerus.blogspot.com' __license__ = 'Creative Commons Attribute By - http://creativecommons.org/licenses/by/3.0/us/''' from numpy import * import pyopencl as cl from string import Template # local imports import objective_scalar class Objective(objective_scalar.Objective): def __init__(self,parameters): objective_scalar.Objective.__init__(self,parameters) precision_dict = { 'float':float32, 'double':float64} self.platform_name = parameters.get('platform_name',None) self.platform_precision = parameters.get('platform_precision','double') self.partial_sum_loc = parameters.get('partial_sum_loc','__global') self.numpy_prec = precision_dict[self.platform_precision] openCL_prec = self.platform_precision point_list = self._p x,y = zip(*point_list) self.x = array(x,dtype=self.numpy_prec) self.y = array(y,dtype=self.numpy_prec) self.f1 = array([0,0],copy=True,dtype=self.numpy_prec) self.f2 = array([0,0],copy=True,dtype=self.numpy_prec) self.a = array([0],copy=True,dtype=self.numpy_prec) self.num_elements = len(x) platforms = cl.get_platforms() if len(platforms)>1 and not self.platform_name is None: # select the the first device in list for platform in platforms: if self.platform_name in platform.name: # use first device for context for device in platform.get_devices(): self.ctx = cl.Context([device]) self.device = device break break else: self.ctx = cl.create_some_context() if 'NVIDIA' in self.platform_name: compiler_options = '-cl-mad-enable -cl-fast-relaxed-math' else: compiler_options = '' #self.ctx = cl.create_some_context() self.queue = cl.CommandQueue(self.ctx) # define the input buffers and transfer data into them mf = cl.mem_flags self.x_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.x) self.y_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.y) self.f1_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f1) self.f2_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f2) self.a_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.a) # define the output buffer self.partial_sum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, self.x.nbytes) self.full_sum_buf = cl.Buffer(self.ctx, mf.READ_ONLY, array([0],dtype=self.numpy_prec).nbytes) # prep the container to the output self.partial_sum = empty(self.x.shape,dtype=self.numpy_prec) self.full_sum = empty(array([0],dtype=self.numpy_prec).shape,dtype=self.numpy_prec) # define local memory self.local_partial_sum = cl.LocalMemory(self.x.nbytes) code_pattern= Template(""" #pragma OPENCL EXTENSION cl_khr_fp64: enable __kernel void partial_sum( __global const ${precision} *x, __global const ${precision} *y, __global const ${precision} *f1, __global const ${precision} *f2, __global const ${precision} *a, ${partial_sum_loc} ${precision} *partial_sum) { int gid = get_global_id(0); ${precision} x_f1_diff = x[gid]-f1[0]; ${precision} x_f2_diff = x[gid]-f2[0]; ${precision} y_f1_diff = y[gid]-f1[1]; ${precision} y_f2_diff = y[gid]-f2[1]; ${precision} x_f1_diff_sq = x_f1_diff*x_f1_diff; ${precision} x_f2_diff_sq = x_f2_diff*x_f2_diff; ${precision} y_f1_diff_sq = y_f1_diff*y_f1_diff; ${precision} y_f2_diff_sq = y_f2_diff*y_f2_diff; ${precision} norm_pt_to_f1 = sqrt(x_f1_diff_sq+y_f1_diff_sq); ${precision} norm_pt_to_f2 = sqrt(x_f2_diff_sq+y_f2_diff_sq); ${precision} t1 = norm_pt_to_f1+norm_pt_to_f2-2*a[0]; partial_sum[gid]=t1*t1; } __kernel void full_sum( ${partial_sum_loc} ${precision} *partial_sum, __global ${precision} *full_sum) { long i; ${precision} sum=0; for(i=0; i<${num_elements}; i++) { sum += partial_sum[i]; } full_sum[0] = sum; } __kernel void full_sum2( __global ${precision} *partial_sum) { int gid = get_global_id(0); long i; long mid_point = (${num_elements}+1)/2; if (gid>mid_point) { return; } else { partial_sum[gid]+=partial_sum[gid+mid_point]; } } """) substitutions = {'precision':openCL_prec, 'num_elements':self.num_elements, 'partial_sum_loc':self.partial_sum_loc} prg = code_pattern.substitute(substitutions) # define the function to be executed self.prg = cl.Program(self.ctx, prg).build(options=compiler_options) def __call__(self,x): ''' Calculate the objective cost in the optimization problem. ''' a = x[0] self.a = array([x[0]],copy=False,dtype=self.numpy_prec) self.f1 = array([x[1],x[2]],copy=False,dtype=self.numpy_prec) self.f2 = array([x[3],x[4]],copy=False,dtype=self.numpy_prec) # trigger transfer of values to the GPU buffers # only those values which changed before this evaluation need to be updated mf = cl.mem_flags self.f1_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f1) self.f2_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f2) self.a_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.a) # execute the partial_sum function if self.partial_sum_loc == '__global': self.prg.partial_sum(self.queue,self.x.shape,(8,), self.x_buf,self.y_buf,self.f1_buf,self.f2_buf,self.a_buf,self.partial_sum_buf) # trigger the copy of the results into the partial_sum container after completion cl.enqueue_read_buffer(self.queue, self.partial_sum_buf, self.partial_sum).wait() # execute the full_sum function self.prg.full_sum(self.queue,(1,),None, self.partial_sum_buf,self.full_sum_buf) else: self.prg.partial_sum(self.queue,self.x.shape,(8,), self.x_buf,self.y_buf,self.f1_buf,self.f2_buf, self.a_buf,self.local_partial_sum) self.prg.full_sum(self.queue,(1,),None, self.local_partial_sum,self.full_sum_buf) # trigger the copy of the results into the partial_sum container after completion cl.enqueue_read_buffer(self.queue, self.full_sum_buf, self.full_sum).wait() # compare sums print (sum(self.full_sum)-sum(self.partial_sum)) print (sum(self.full_sum),sum(self.partial_sum)) # finish up the calculations n = float(len(point_list)) _lambda =0.1 #full_sum = sum(self.partial_sum) total = (self.full_sum/float(n) + _lambda*self._ahat_max*self._sigma*exp((a/self._ahat_max)**4)) return total def _del_(self): pass if __name__=='__main__': import time from random import uniform,normalvariate,seed # local modules import ellipse #################################################################### # setup test conditions num_reps = 100 num_pts = 256 precision = 'float' seed(1234567890) # set the random generator to get repeatable results # setup the reference ellipse # define the foci locations foci1_ref = array([2,-1]) foci2_ref = array([-2,1]) # define distance from foci to ellipse circumference a_ref = 2.5 point_list = ellipse.generate_point_list(num_pts,a_ref,foci1_ref,foci2_ref) parameters = { "point_list" : point_list, "platform_name" : 'NVIDIA', "platform_precisions" : precision} # test the function on an NVIDIA card t0 = time.time() my_objective = Objective(parameters) x = my_objective.x0 t1 = time.time() for i in range(0,num_reps): y = my_objective(x) t2 = time.time() print '' print 'NVIDIA initialization took %f sec' % (t1-t0) print 'NVIDIA execution took %f sec' % (t2-t1) print 'Executed %i times on %s' % (num_reps,my_objective.device.name) print '' # test the function on an intel CPU parameters = { "point_list" : point_list, "platform_name" : 'Intel', "platform_precisions" : precision} t0 = time.time() my_objective = Objective(parameters) x = my_objective.x0 t1 = time.time() for i in range(0,num_reps): y = my_objective(x) t2 = time.time() print '' print 'Intel initialization took %f sec' % (t1-t0) print 'Intel execution took %f sec' % (t2-t1) print 'Executed %i times on %s' % (num_reps,my_objective.device.name) print ''
Next Steps:
- Integrate with the ellipse fit problem
- Explain the behavior of the code example
- Measure performance wrt scalar and vectorized solutions
No comments:
Post a Comment