Problem Statement
Can PyOpenCL be used to improve the performance of the ellipse fit problem?
Discussion
The objective function can be recoded using PyOpenCL. This module simplifies experimentation with global and local memory, and it simplifies selecting the calculation precision.
To test this get the other code snippets from here. Install all of the modules in a directory, then run this module.
objective_pyopencl.py
'''
This module contains an objective function for the ellipse fitting problem.
The objective is coded using vectorized operations.
'''
__author__ = 'Ed Tate'
__email__ = 'edtate<at>gmail-dot-com'
__website__ = 'exnumerus.blogspot.com'
__license__ = 'Creative Commons Attribute By - http://creativecommons.org/licenses/by/3.0/us/'''
from numpy import *
import pyopencl as cl
from string import Template
# local imports
import objective_scalar
class Objective(objective_scalar.Objective):
    '''
    Ellipse-fit objective function evaluated with PyOpenCL.

    The per-point squared-error terms are computed by the ``partial_sum``
    kernel; the ``full_sum`` kernel reduces them to a scalar on the device.
    '''

    def __init__(self, parameters):
        '''
        Build the OpenCL context, buffers and program.

        parameters : dict understood by objective_scalar.Objective, plus:
            'platform_name'      - substring of the OpenCL platform name to
                                   select (e.g. 'NVIDIA'); None lets PyOpenCL
                                   choose a context interactively.
            'platform_precision' - 'float' or 'double' (default 'double').
            'partial_sum_loc'    - memory space of the partial-sum array in
                                   the kernels, '__global' (default) or
                                   '__local'.
        '''
        objective_scalar.Objective.__init__(self, parameters)
        precision_dict = {
            'float': float32,
            'double': float64}
        self.platform_name = parameters.get('platform_name', None)
        self.platform_precision = parameters.get('platform_precision', 'double')
        self.partial_sum_loc = parameters.get('partial_sum_loc', '__global')
        self.numpy_prec = precision_dict[self.platform_precision]
        openCL_prec = self.platform_precision
        # unpack the sample points into contiguous x/y arrays of the
        # requested precision
        point_list = self._p
        x, y = zip(*point_list)
        self.x = array(x, dtype=self.numpy_prec)
        self.y = array(y, dtype=self.numpy_prec)
        self.f1 = array([0, 0], copy=True, dtype=self.numpy_prec)
        self.f2 = array([0, 0], copy=True, dtype=self.numpy_prec)
        self.a = array([0], copy=True, dtype=self.numpy_prec)
        self.num_elements = len(x)
        # select a context, preferring the requested platform when given
        self.ctx = None
        self.device = None
        platforms = cl.get_platforms()
        if len(platforms) > 1 and self.platform_name is not None:
            for platform in platforms:
                if self.platform_name in platform.name:
                    # use the first device of the matching platform
                    for device in platform.get_devices():
                        self.ctx = cl.Context([device])
                        self.device = device
                        break
                    break
        if self.ctx is None:
            # BUG FIX: previously the requested-but-not-found case left
            # self.ctx unset; fall back to PyOpenCL's default selection.
            self.ctx = cl.create_some_context()
        # BUG FIX: guard against platform_name being None before the
        # substring test ('NVIDIA' in None raises TypeError)
        if self.platform_name is not None and 'NVIDIA' in self.platform_name:
            compiler_options = '-cl-mad-enable -cl-fast-relaxed-math'
        else:
            compiler_options = ''
        self.queue = cl.CommandQueue(self.ctx)
        # define the input buffers and transfer data into them
        mf = cl.mem_flags
        self.x_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.x)
        self.y_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.y)
        self.f1_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f1)
        self.f2_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f2)
        self.a_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.a)
        # define the output buffers
        self.partial_sum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, self.x.nbytes)
        # BUG FIX: the full_sum kernel WRITES this buffer, so it must not be
        # created READ_ONLY (which means read-only from the kernel's side)
        self.full_sum_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY,
                                      array([0], dtype=self.numpy_prec).nbytes)
        # host-side containers the results are read back into
        self.partial_sum = empty(self.x.shape, dtype=self.numpy_prec)
        self.full_sum = empty(array([0], dtype=self.numpy_prec).shape,
                              dtype=self.numpy_prec)
        # define local memory for the '__local' partial-sum variant
        self.local_partial_sum = cl.LocalMemory(self.x.nbytes)
        code_pattern = Template("""
            #pragma OPENCL EXTENSION cl_khr_fp64: enable
            __kernel void partial_sum(
                __global const ${precision} *x,
                __global const ${precision} *y,
                __global const ${precision} *f1,
                __global const ${precision} *f2,
                __global const ${precision} *a,
                ${partial_sum_loc} ${precision} *partial_sum)
            {
                int gid = get_global_id(0);
                ${precision} x_f1_diff = x[gid]-f1[0];
                ${precision} x_f2_diff = x[gid]-f2[0];
                ${precision} y_f1_diff = y[gid]-f1[1];
                ${precision} y_f2_diff = y[gid]-f2[1];
                ${precision} x_f1_diff_sq = x_f1_diff*x_f1_diff;
                ${precision} x_f2_diff_sq = x_f2_diff*x_f2_diff;
                ${precision} y_f1_diff_sq = y_f1_diff*y_f1_diff;
                ${precision} y_f2_diff_sq = y_f2_diff*y_f2_diff;
                ${precision} norm_pt_to_f1 = sqrt(x_f1_diff_sq+y_f1_diff_sq);
                ${precision} norm_pt_to_f2 = sqrt(x_f2_diff_sq+y_f2_diff_sq);
                ${precision} t1 = norm_pt_to_f1+norm_pt_to_f2-2*a[0];
                partial_sum[gid]=t1*t1;
            }
            __kernel void full_sum(
                ${partial_sum_loc} ${precision} *partial_sum,
                __global ${precision} *full_sum)
            {
                long i;
                ${precision} sum=0;
                for(i=0; i<${num_elements}; i++) {
                    sum += partial_sum[i];
                }
                full_sum[0] = sum;
            }
            __kernel void full_sum2(
                __global ${precision} *partial_sum)
            {
                int gid = get_global_id(0);
                long i;
                long mid_point = (${num_elements}+1)/2;
                if (gid>mid_point) {
                    return;
                } else {
                    partial_sum[gid]+=partial_sum[gid+mid_point];
                }
            }
            """)
        substitutions = {'precision': openCL_prec,
                         'num_elements': self.num_elements,
                         'partial_sum_loc': self.partial_sum_loc}
        prg = code_pattern.substitute(substitutions)
        # compile the kernels for the selected context
        self.prg = cl.Program(self.ctx, prg).build(options=compiler_options)

    def __call__(self, x):
        '''
        Calculate the objective cost in the optimization problem.

        x : sequence [a, f1x, f1y, f2x, f2y] -- the half-major-axis length
            and the two focus coordinates.
        Returns a length-1 array with the regularized sum of squared errors.
        '''
        a = x[0]
        self.a = array([x[0]], copy=False, dtype=self.numpy_prec)
        self.f1 = array([x[1], x[2]], copy=False, dtype=self.numpy_prec)
        self.f2 = array([x[3], x[4]], copy=False, dtype=self.numpy_prec)
        # trigger transfer of values to the GPU buffers;
        # only those values which changed before this evaluation are updated
        mf = cl.mem_flags
        self.f1_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f1)
        self.f2_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.f2)
        self.a_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.a)
        # execute the partial_sum kernel, then reduce with full_sum
        if self.partial_sum_loc == '__global':
            self.prg.partial_sum(self.queue, self.x.shape, (8,),
                                 self.x_buf, self.y_buf, self.f1_buf,
                                 self.f2_buf, self.a_buf, self.partial_sum_buf)
            # copy the per-point results back for the debug comparison below
            cl.enqueue_read_buffer(self.queue, self.partial_sum_buf,
                                   self.partial_sum).wait()
            self.prg.full_sum(self.queue, (1,), None,
                              self.partial_sum_buf, self.full_sum_buf)
        else:
            self.prg.partial_sum(self.queue, self.x.shape, (8,),
                                 self.x_buf, self.y_buf, self.f1_buf,
                                 self.f2_buf, self.a_buf, self.local_partial_sum)
            self.prg.full_sum(self.queue, (1,), None,
                              self.local_partial_sum, self.full_sum_buf)
        # read back the reduced sum
        cl.enqueue_read_buffer(self.queue, self.full_sum_buf, self.full_sum).wait()
        # debug output: compare the device-side reduction with the host sum
        print (sum(self.full_sum) - sum(self.partial_sum))
        print (sum(self.full_sum), sum(self.partial_sum))
        # finish up the calculations
        # BUG FIX: 'point_list' was a local of __init__ and raised a
        # NameError here; the point count is stored as num_elements.
        n = float(self.num_elements)
        _lambda = 0.1
        total = (self.full_sum / float(n) +
                 _lambda * self._ahat_max * self._sigma * exp((a / self._ahat_max) ** 4))
        return total

    def __del__(self):
        # BUG FIX: was misspelled '_del_', which Python never calls.
        # No explicit cleanup needed; PyOpenCL releases buffers on GC.
        pass
if __name__ == '__main__':
    import time
    from random import uniform, normalvariate, seed
    # local modules
    import ellipse
    ####################################################################
    # setup test conditions
    num_reps = 100
    num_pts = 256
    precision = 'float'
    seed(1234567890)  # set the random generator to get repeatable results
    # setup the reference ellipse
    # define the foci locations
    foci1_ref = array([2, -1])
    foci2_ref = array([-2, 1])
    # define distance from foci to ellipse circumference
    a_ref = 2.5
    point_list = ellipse.generate_point_list(num_pts, a_ref, foci1_ref, foci2_ref)
    # BUG FIX: the key must be 'platform_precision' (singular) to match
    # Objective.__init__; 'platform_precisions' was silently ignored and
    # the default 'double' precision was always used.
    parameters = {"point_list": point_list,
                  "platform_name": 'NVIDIA',
                  "platform_precision": precision}
    # test the function on an NVIDIA card
    t0 = time.time()
    my_objective = Objective(parameters)
    x = my_objective.x0
    t1 = time.time()
    for i in range(0, num_reps):
        y = my_objective(x)
    t2 = time.time()
    print ('')
    print ('NVIDIA initialization took %f sec' % (t1 - t0))
    print ('NVIDIA execution took %f sec' % (t2 - t1))
    print ('Executed %i times on %s' % (num_reps, my_objective.device.name))
    print ('')
    # test the function on an Intel CPU
    parameters = {"point_list": point_list,
                  "platform_name": 'Intel',
                  "platform_precision": precision}
    t0 = time.time()
    my_objective = Objective(parameters)
    x = my_objective.x0
    t1 = time.time()
    for i in range(0, num_reps):
        y = my_objective(x)
    t2 = time.time()
    print ('')
    print ('Intel initialization took %f sec' % (t1 - t0))
    print ('Intel execution took %f sec' % (t2 - t1))
    print ('Executed %i times on %s' % (num_reps, my_objective.device.name))
    print ('')
Next Steps:
- Integrate with the ellipse fit problem
- Explain the behavior of the code example
- Measure performance wrt scalar and vectorized solutions
No comments:
Post a Comment