Sunday, June 12, 2011

First Draft: Solving the ellipse fit problem using PyOpenCL–recoding the objective function

Problem Statement

Can PyOpenCL be used to improve the performance of the ellipse fit problem.

Discussion

The objection function can be recoded using PyOpenCL. This module simplifies experimentation with global and local memory and it simplifies selecting the calculation precision.

To test this get the other code snippets from here. Install all of the modules in a directory, then run this module.

objective_pyopencl.py

'''
This module contains an objective function for the ellipse fitting problem.
The objective is coded using vectorized operations.

'''

__author__ = 'Ed Tate'
__email__  = 'edtate<at>gmail-dot-com'
__website__ = 'exnumerus.blogspot.com'
__license__ = 'Creative Commons Attribute By - http://creativecommons.org/licenses/by/3.0/us/'''

from numpy import *
import pyopencl as cl
from string import Template

# local imports
import objective_scalar


class Objective(objective_scalar.Objective):

    def __init__(self,parameters):
        objective_scalar.Objective.__init__(self,parameters)
   
        precision_dict = {
            'float':float32,
            'double':float64}    
            
        self.platform_name      = parameters.get('platform_name',None)
        self.platform_precision = parameters.get('platform_precision','double') 
        self.partial_sum_loc    = parameters.get('partial_sum_loc','__global')           
            
        self.numpy_prec = precision_dict[self.platform_precision]
        openCL_prec = self.platform_precision
    
        point_list = self._p    
    
        x,y  = zip(*point_list)
        self.x  = array(x,dtype=self.numpy_prec)
        self.y  = array(y,dtype=self.numpy_prec)
        self.f1 = array([0,0],copy=True,dtype=self.numpy_prec)
        self.f2 = array([0,0],copy=True,dtype=self.numpy_prec)
        self.a  = array([0],copy=True,dtype=self.numpy_prec)
        
        self.num_elements = len(x)
        
        platforms = cl.get_platforms()
        if len(platforms)>1 and not self.platform_name is None:
            # select the the first device in list
            for platform in platforms:
                if self.platform_name in platform.name:
                    # use first device for context
                    for device in platform.get_devices():
                        self.ctx = cl.Context([device])
                        self.device = device
                        break
                    break
        else:
            self.ctx = cl.create_some_context()    
            
        if 'NVIDIA' in self.platform_name:
            compiler_options = '-cl-mad-enable -cl-fast-relaxed-math'
        else:
            compiler_options = ''
        
        #self.ctx = cl.create_some_context()
        self.queue = cl.CommandQueue(self.ctx)
        
        # define the input buffers and transfer data into them
        mf = cl.mem_flags
        self.x_buf  = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.x)
        self.y_buf  = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.y)
        self.f1_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f1)
        self.f2_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f2)
        self.a_buf  = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.a)
    
        # define the output buffer
        self.partial_sum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, self.x.nbytes)
        self.full_sum_buf    = cl.Buffer(self.ctx, mf.READ_ONLY, array([0],dtype=self.numpy_prec).nbytes) 

    
        # prep the container to the output
        self.partial_sum = empty(self.x.shape,dtype=self.numpy_prec)
        self.full_sum    = empty(array([0],dtype=self.numpy_prec).shape,dtype=self.numpy_prec)
    
        # define local memory
        self.local_partial_sum = cl.LocalMemory(self.x.nbytes)            
    
        code_pattern= Template("""
            #pragma OPENCL EXTENSION cl_khr_fp64: enable
            __kernel void partial_sum(
            __global const ${precision} *x,
            __global const ${precision} *y, 
            __global const ${precision} *f1,
            __global const ${precision} *f2,
            __global const ${precision} *a,
            ${partial_sum_loc} ${precision} *partial_sum)
            {
              int gid = get_global_id(0);
              
              ${precision} x_f1_diff = x[gid]-f1[0];
              ${precision} x_f2_diff = x[gid]-f2[0];
              ${precision} y_f1_diff = y[gid]-f1[1];
              ${precision} y_f2_diff = y[gid]-f2[1];
              
              ${precision} x_f1_diff_sq = x_f1_diff*x_f1_diff; 
              ${precision} x_f2_diff_sq = x_f2_diff*x_f2_diff;
              ${precision} y_f1_diff_sq = y_f1_diff*y_f1_diff;
              ${precision} y_f2_diff_sq = y_f2_diff*y_f2_diff;
              
              ${precision} norm_pt_to_f1 = sqrt(x_f1_diff_sq+y_f1_diff_sq);
              ${precision} norm_pt_to_f2 = sqrt(x_f2_diff_sq+y_f2_diff_sq);
        
              ${precision} t1 = norm_pt_to_f1+norm_pt_to_f2-2*a[0];
        
              partial_sum[gid]=t1*t1;
            }
            
            __kernel void full_sum(
            ${partial_sum_loc} ${precision} *partial_sum,
            __global ${precision} *full_sum)
            {
              long i;
              ${precision} sum=0;
              
              for(i=0; i<${num_elements}; i++) {
                  sum += partial_sum[i];
                  }
              full_sum[0] = sum;
            }            
            
            __kernel void full_sum2(
            __global ${precision} *partial_sum)
            {
              int gid = get_global_id(0);
              long i;
              long mid_point = (${num_elements}+1)/2;
              if (gid>mid_point) {
                  return;
              } else {
                  partial_sum[gid]+=partial_sum[gid+mid_point];
              }
            }             
        """)
        substitutions = {'precision':openCL_prec,
                         'num_elements':self.num_elements,
                         'partial_sum_loc':self.partial_sum_loc}
        prg = code_pattern.substitute(substitutions)
    
          
    
        # define the function to be executed
        self.prg = cl.Program(self.ctx, prg).build(options=compiler_options)


    def __call__(self,x):
        '''
        Calculate the objective cost in the optimization problem.
        '''
            
        a      = x[0]
        self.a  = array([x[0]],copy=False,dtype=self.numpy_prec)
        self.f1 = array([x[1],x[2]],copy=False,dtype=self.numpy_prec)
        self.f2 = array([x[3],x[4]],copy=False,dtype=self.numpy_prec)
        
        # trigger transfer of values to the GPU buffers
        # only those values which changed before this evaluation need to be updated
        mf = cl.mem_flags        
        self.f1_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f1)
        self.f2_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f2)
        self.a_buf  = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.a)
      
        # execute the partial_sum function
        if self.partial_sum_loc == '__global':
            self.prg.partial_sum(self.queue,self.x.shape,(8,),
                        self.x_buf,self.y_buf,self.f1_buf,self.f2_buf,self.a_buf,self.partial_sum_buf)
            # trigger the copy of the results into the partial_sum container after completion
            cl.enqueue_read_buffer(self.queue, self.partial_sum_buf, self.partial_sum).wait()
            # execute the full_sum function
            self.prg.full_sum(self.queue,(1,),None,
                            self.partial_sum_buf,self.full_sum_buf)
        else:
            self.prg.partial_sum(self.queue,self.x.shape,(8,),
                        self.x_buf,self.y_buf,self.f1_buf,self.f2_buf,
                        self.a_buf,self.local_partial_sum)
            self.prg.full_sum(self.queue,(1,),None,
                        self.local_partial_sum,self.full_sum_buf)
                        
    
        # trigger the copy of the results into the partial_sum container after completion
        cl.enqueue_read_buffer(self.queue, self.full_sum_buf, self.full_sum).wait()

        # compare sums
        print (sum(self.full_sum)-sum(self.partial_sum))
        print (sum(self.full_sum),sum(self.partial_sum))

        # finish up the calculations 
        n = float(len(point_list))
        _lambda =0.1
        #full_sum = sum(self.partial_sum)
        total = (self.full_sum/float(n) + 
            _lambda*self._ahat_max*self._sigma*exp((a/self._ahat_max)**4))
        
        return total


    def _del_(self):
        pass


if __name__=='__main__':
    import time
    from random import uniform,normalvariate,seed
    # local modules
    import ellipse
    
    ####################################################################

    # setup test conditions
    num_reps = 100    
    num_pts = 256
    precision = 'float'
    seed(1234567890)      # set the random generator to get repeatable results
    
    # setup the reference ellipse
    
    # define the foci locations
    foci1_ref = array([2,-1])
    foci2_ref = array([-2,1])
    # define distance from foci to ellipse circumference
    a_ref = 2.5
    
    point_list = ellipse.generate_point_list(num_pts,a_ref,foci1_ref,foci2_ref)
    
    parameters = { "point_list" : point_list,
                   "platform_name" : 'NVIDIA',
                   "platform_precisions" : precision}
    
    # test the function on an NVIDIA card
    t0 = time.time()
    my_objective = Objective(parameters)  
    x = my_objective.x0

    t1 = time.time()
    for i in range(0,num_reps):
        y  = my_objective(x)    
    t2 = time.time()
    
    print ''
    print 'NVIDIA initialization took %f sec' % (t1-t0)
    print 'NVIDIA execution took %f sec' % (t2-t1)
    print 'Executed %i times on %s' % (num_reps,my_objective.device.name)
    print ''
    
    # test the function on an intel CPU
    
    parameters = { "point_list" : point_list,
                   "platform_name" : 'Intel',
                   "platform_precisions" : precision}
    
    
    t0 = time.time()
    my_objective = Objective(parameters)
    x = my_objective.x0

    t1 = time.time()
    for i in range(0,num_reps):
        y  = my_objective(x)    
    t2 = time.time()    
    
    print ''
    print 'Intel initialization took %f sec' % (t1-t0)
    print 'Intel execution took %f sec' % (t2-t1)
    print 'Executed %i times on %s' % (num_reps,my_objective.device.name)
    print ''

Next Steps:

  • Integrate with the ellipse fit problem
  • Explain the behavior of the code example
  • Measure performance wrt scalar and vectorized solutions
This work is licensed under a Creative Commons Attribution By license.

No comments:

Post a Comment