From ada0fe20c72bc8cc39ace9ba7e17068533c2c002 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Mon, 6 Sep 2010 14:19:06 -0400 Subject: [PATCH] Random floats (I think) --- TODO | 59 ++++++++++++++++++++++++---------------- cuburnlib/device_code.py | 43 ++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 33 deletions(-) diff --git a/TODO b/TODO index 8ac22ee..ebc2be4 100644 --- a/TODO +++ b/TODO @@ -1,23 +1,23 @@ -Status: currently broken (syntax errors, incomplete sections) +Status: passes rudimentary tests Current goals: -- Test DeviceStream, and get it working. Bugs are expected. - - Test allocator - - Test statement evaluator - - Test packing correctly - - Test that device instructions get injected correctly - - Test in working implementation -- Load a set of genomes and calculate a bare minimum `Feature` set (no xforms, - no filters, no oversample) -- Get frames loaded for rendering -- Get IterThread running in device kernel - - For now, implement as `PTXTest` - - For each frame, loop for FUSE times, then loop through expected number of - points for each CP. Keep a count of number of times looped, and number of - stores that would be done. Verify against expected counts. +- Draw some dang points! + - Allocate buffer (can it be pre-allocated?) 
+ - Direct scatter linear points by GTID from flame number + - Re-enable preview window + - Execute frame, update texture, repeat +- Writeback of points to the buffer + - Define writeback class, args + - Do camera rotation across frameset + - Postpone other kinds of testing and address clamping for now +- Start xforms + - At first, fixed Sierpinski triangle or something + - xform selection, pre- and post-transform in xform + - first of the variations Things to do (rather severely incomplete): + - LaunchContext thread distribution based on generated code register count and shared memory size - qlocal storage @@ -27,9 +27,6 @@ Things to do (rather severely incomplete): - The `Feature` class - Transform count and per-transform code layout - Filter size, oversample, final buffer size -- Palette storage - - Performance implications of different state spaces - - Performance and quality of 2D texture interpolation - Buffer allocation, clearing, reading from device - Preview window - When/how to sample? @@ -41,8 +38,24 @@ Things to do (rather severely incomplete): - Implement - Test effects on quality by masking off writes on all but one lane and boosting the sample density to compensate (muuuuuch later on) -- MWC RNG output types - - float in range [0, 1] -- Debug statements - - Some code can't be tested separately (notably IterThread). Make a debug - flag which embeds extra tests into the kernel +- DE + +Things to test: + +- DeviceStream allocator and proper handling of corner cases +- Debug flag/dict/whatever for entire project in general + - Iteration counters for IterThread + +Things to benchmark: + +- Kernel invocation and/or interrupt times (will high load freeze X?) +- 1D/2D texture load+interpolation speeds vs constant memory loading + - Must test under high SFU load + - Tex uses separate cache? Has lower bandwidth penalty for gather? 
+- MWC float conversion
+- The entire scatter process
+  - Radix sort of writeback coordinates
+  - Log-copy-histogram approach
+  - Direct reductions
+  - Surface loads, stores, reductions
+
diff --git a/cuburnlib/device_code.py b/cuburnlib/device_code.py
index 8401fcf..bed0e38 100644
--- a/cuburnlib/device_code.py
+++ b/cuburnlib/device_code.py
@@ -40,10 +40,10 @@ class IterThread(PTXTest):
         op.mov.u32(num_writes, 0)
 
         # TODO: MWC float output types
-        #mwc_next_f32_01(x_coord)
-        #mwc_next_f32_01(y_coord)
-        #mwc_next_f32_01(color_coord)
-        #mwc_next_f32_01(alpha_coord)
+        mwc_next_f32_01(x_coord)
+        mwc_next_f32_01(y_coord)
+        mwc_next_f32_01(color_coord)
+        mwc_next_f32_01(alpha_coord)
 
         # Registers are hard to come by. To avoid having to track both the count
         # of samples processed and the number of samples to generate,
@@ -189,17 +189,40 @@ class MWCRNG(PTXFragment):
         op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
         op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
 
+    @ptx_func
+    def _next(self):
+        # Advance the MWC state in place. Call from inside a block!
+        reg.u64('mwc_out')
+        op.cvt.u64.u32(mwc_out, mwc_car)  # widen the carry to the 64-bit addend
+        op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)  # st*mult + carry
+        op.mov.b64(vec(mwc_st, mwc_car), mwc_out)  # unpack into (mwc_st, mwc_car)
+
     @ptx_func
     def next_b32(self, dst_reg):
-        with block('Load next random into ' + dst_reg.name):
-            reg.u64('mwc_out')
-            op.cvt.u64.u32(mwc_out, mwc_car)
-            op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
-            op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
+        with block('Load next random u32 into ' + dst_reg.name):
+            self._next()
             op.mov.u32(dst_reg, mwc_st)
 
+    @ptx_func
+    def next_f32_01(self, dst_reg):
+        # TODO: verify that this is the fastest-performance method
+        # Scale the u32 by 2**-32; PTX 0f literals are the IEEE-754 word, MSB first
+        with block('Load random float [0,1] into ' + dst_reg.name):
+            self._next()
+            op.cvt.rn.f32.u32(dst_reg, mwc_st)
+            op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)
+
+    @ptx_func
+    def next_f32_11(self, dst_reg):
+        with block('Load random float [-1,1) into ' + dst_reg.name):
+            self._next()
+            op.cvt.rn.f32.s32(dst_reg, mwc_st)  # same state word, read as signed
+            op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31); no .lo on f32
+
     def to_inject(self):
-        return dict(mwc_next_b32=self.next_b32)
+        return dict(mwc_next_b32=self.next_b32,
+                    mwc_next_f32_01=self.next_f32_01,
+                    mwc_next_f32_11=self.next_f32_11)
 
     def device_init(self, ctx):
         if self.threads_ready >= ctx.threads: