From 1aafe4a93c44c1ca813df576a9fe64fa9e83e3eb Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Wed, 4 May 2011 09:52:20 -0400 Subject: [PATCH] Some light performance optimizations --- cuburn/code/iter.py | 25 +++++++++++++++---------- cuburn/code/util.py | 6 ++---- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 055fe77..610a246 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -26,6 +26,7 @@ class IterCode(HunkOCode): decls = """ // Note: for normalized lookups, uchar4 actually returns floats texture palTex; +__shared__ iter_info info; """ def _xfbody(self, xfid, xform): @@ -34,8 +35,7 @@ texture palTex; tmpl = Template(""" __device__ -void apply_xf{{xfid}}(float *ix, float *iy, float *icolor, - const iter_info *info, mwc_st *rctx) { +void apply_xf{{xfid}}(float *ix, float *iy, float *icolor, mwc_st *rctx) { float tx, ty, ox = *ix, oy = *iy; {{apply_affine_flam3('ox', 'oy', 'tx', 'ty', px, 'xf.c', 'pre')}} @@ -65,7 +65,12 @@ void apply_xf{{xfid}}(float *ix, float *iy, float *icolor, __global__ void iter(mwc_st *msts, iter_info *infos, float *accbuf, float *denbuf) { mwc_st rctx = msts[gtid()]; - iter_info *info = &(infos[blockIdx.x]); + iter_info *info_glob = &(infos[blockIdx.x]); + + // load info to shared memory cooperatively + for (int i = threadIdx.x; i * 4 < sizeof(iter_info); i += blockDim.x) + reinterpret_cast(&info)[i] = + reinterpret_cast(info_glob)[i]; int consec_bad = -{{features.fuse}}; // TODO: make nsteps adjustable via genome @@ -82,7 +87,7 @@ void iter(mwc_st *msts, iter_info *infos, float *accbuf, float *denbuf) { {{for xfid, xform in enumerate(features.xforms)}} {{if xfid != features.final_xform_index}} if (xfsel <= {{packer.get('cp.norm_density[%d]' % xfid)}}) { - apply_xf{{xfid}}(&x, &y, &color, info, &rctx); + apply_xf{{xfid}}(&x, &y, &color, &rctx); } else {{endif}} {{endfor}} @@ -92,7 +97,7 @@ void iter(mwc_st *msts, iter_info *infos, float *accbuf, float *denbuf) { } {{if features.final_xform_index}} float fx = x, fy = y, fcolor; - apply_xf{{features.final_xform_index}}(&fx, &fy, &fcolor, info, &rctx); + apply_xf{{features.final_xform_index}}(&fx, &fy, &fcolor, &rctx); {{endif}} if (consec_bad < 0) { @@ -133,9 +138,7 @@ void iter(mwc_st *msts, iter_info *infos, float *accbuf, float *denbuf) { int i = iy * {{features.width}} + ix; - // since info was declared const, C++ barfs unless it's loaded first - float cp_step_frac = {{packer.get('cp_step_frac')}}; - float4 outcol = tex2D(palTex, color, cp_step_frac); + float4 outcol = tex2D(palTex, color, {{packer.get('cp_step_frac')}}); accbuf[i*4] += outcol.x; accbuf[i*4+1] += outcol.y; accbuf[i*4+2] += outcol.z; @@ -157,11 +160,12 @@ def render(features, cps): seeds = mwc.MWC.make_seeds(512 * nsteps) iter = IterCode(features) - code = assemble_code(BaseCode, mwc.MWC, iter, iter.packer, filter.ColorClip) + code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter, filter.ColorClip) for lno, line in enumerate(code.split('\n')): print '%3d %s' % (lno, line) - mod = SourceModule(code, options=['--use_fast_math'], keep=True) + mod = SourceModule(code, keep=True, + options=['-use_fast_math', '-maxrregcount', '32']) cps_as_array = (Genome * len(cps))() for i, cp in enumerate(cps): @@ -205,6 +209,7 @@ def render(features, cps): dbufd = cuda.to_device(dbuf) fun = mod.get_function("iter") + fun.set_cache_config(cuda.func_cache.PREFER_L1) t = fun(InOut(seeds), InOut(infos), abufd, dbufd, block=(512,1,1), grid=(nsteps,1), time_kernel=True) print "Completed render in %g seconds" % t diff --git a/cuburn/code/util.py b/cuburn/code/util.py index 0cfaf3a..ebba41d 100644 --- a/cuburn/code/util.py +++ b/cuburn/code/util.py @@ -91,7 +91,7 @@ class DataPackerView(object): name = name.replace('[', '_').replace(']', '') name = self.prefix + name self.packer._access(self, accessor, name) - return '%s->%s' % (self.ptr, name) + return '%s.%s' % (self.ptr, name) def sub(self, dst, src): """Add a substitution to the namespace.""" @@ -118,7 +118,7 @@ class DataPacker(HunkOCode): default_namespace = {'np': np} - def __init__(self, tname, clsize=128): + def __init__(self, tname, clsize=4): """ Create a new DataPacker. @@ -175,7 +175,6 @@ class DataPacker(HunkOCode): def decls(self): tmpl = Template(""" typedef struct { - {{for name, accessor in values}} float {{'%-20s' % name}}; // {{accessor}} {{endfor}} @@ -183,7 +182,6 @@ typedef struct { // Align to fill whole cache lines float padding[{{padding}}]; {{endif}} - } {{tname}}; """) return tmpl.substitute(