From 1faffa1d1475ff7dcb2602fbcfd0a2517d7f9ac1 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Thu, 27 Oct 2011 10:35:01 -0400 Subject: [PATCH] 'fill_dptr' instead of 'zero_dptr' --- cuburn/code/util.py | 21 ++++++++++++++------- cuburn/render.py | 6 +++--- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/cuburn/code/util.py b/cuburn/code/util.py index a16163d..5f16437 100644 --- a/cuburn/code/util.py +++ b/cuburn/code/util.py @@ -90,10 +90,10 @@ uint32_t trunca(float f) { } __global__ -void zero_dptr(float* dptr, int size) { +void fill_dptr(uint32_t* dptr, int size, uint32_t value) { int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x; if (i < size) { - dptr[i] = 0.0f; + dptr[i] = value; } } @@ -185,14 +185,21 @@ float3 hsv2rgb(float3 hsv) { """ @staticmethod - def zero_dptr(mod, dptr, size, stream=None): + def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)): """ - A memory zeroer which can be embedded in a stream. Size is the - number of 4-byte words in the pointer. + A memory zeroer which can be embedded in a stream, unlike the various + memset routines. Size is the number of 4-byte words in the pointer; + value is the word to fill it with. If value is not an np.uint32, it + will be coerced to a buffer and the first four bytes taken. """ - zero = mod.get_function("zero_dptr") + fill = mod.get_function("fill_dptr") + if not isinstance(value, np.uint32): + if isinstance(value, int): + value = np.uint32(value) + else: + value = np.frombuffer(buffer(value), np.uint32)[0] blocks = int(np.ceil(np.sqrt(size / 1024 + 1))) - zero(dptr, np.int32(size), stream=stream, + fill(dptr, np.int32(size), value, stream=stream, block=(1024, 1, 1), grid=(blocks, blocks, 1)) diff --git a/cuburn/render.py b/cuburn/render.py index 75ffd9b..3d27412 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -166,7 +166,7 @@ class Renderer(object): iter_fun = self.mod.get_function("iter") #iter_fun.set_cache_config(cuda.func_cache.PREFER_L1) - util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream) + util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, filt_stream) last_time = times[0][0] @@ -221,9 +221,9 @@ class Renderer(object): yield last_time, self._trim(h_out) last_time = start - util.BaseCode.zero_dptr(self.mod, d_out, 4 * nbins, filt_stream) + util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream) self._de.invoke(self.mod, cen_cp, d_accum, d_out, filt_stream) - util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream) + util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, filt_stream) filter_done_event = cuda.Event().record(filt_stream) f32 = np.float32