Use values in CUDA `__constant__` memory for the image size instead of immediates.

This saves us from having to recompile if the frame size changes.
This commit is contained in:
Steven Robertson 2011-12-08 12:07:22 -05:00
parent 084a65c615
commit b73461132c
2 changed files with 13 additions and 2 deletions

View File

@ -129,6 +129,13 @@ texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
__shared__ iter_params params;
__device__ int rb_head, rb_tail, rb_size;
// Accumulator dimensions, read by the kernel at run time instead of being
// substituted into the generated source as literal values.
typedef struct {
uint32_t width;   // exclusive upper bound on ix in iter()'s bounds check
uint32_t height;  // exclusive upper bound on iy in iter()'s bounds check
uint32_t stride;  // row multiplier for the flat index: i = iy * stride + ix
} acc_size_t;
// The host writes three uint32 values here (acc_width, acc_height, acc_stride,
// in that order), so the field order above must stay in sync with that upload.
__constant__ acc_size_t acc_size;
"""
def _xfbody(self, xfid, xform):
@ -330,14 +337,14 @@ void iter(
uint32_t ix = trunca(cx), iy = trunca(cy);
if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}}) {
if (ix >= acc_size.width || iy >= acc_size.height) {
{{if info.acc_mode == 'deferred'}}
*log = 0xffffffff;
{{endif}}
continue;
}
uint32_t i = iy * {{info.acc_stride}} + ix;
uint32_t i = iy * acc_size.stride + ix;
{{if info.acc_mode == 'atomic'}}
float4 outcol = tex2D(palTex, cc, time_frac);

View File

@ -132,6 +132,10 @@ class Renderer(object):
d_accum = cuda.mem_alloc(16 * nbins)
d_out = cuda.mem_alloc(16 * nbins)
acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
d_acc_size = self.mod.get_global('acc_size')[0]
cuda.memcpy_htod_async(d_acc_size, np.uint32(acc_size), write_stream)
if info.acc_mode == 'deferred':
# Having a fixed, power-of-two log size makes things much easier
log_size = 64 << 20