Random floats (I think)

This commit is contained in:
Steven Robertson 2010-09-06 14:19:06 -04:00
parent f3298e0bed
commit ada0fe20c7
2 changed files with 69 additions and 33 deletions

59
TODO
View File

@ -1,23 +1,23 @@
Status: currently broken (syntax errors, incomplete sections) Status: passes rudimentary tests
Current goals: Current goals:
- Test DeviceStream, and get it working. Bugs are expected.
- Test allocator
- Test statement evaluator
- Test packing correctly
- Test that device instructions get injected correctly
- Test in working implementation
- Load a set of genomes and calculate a bare minimum `Feature` set (no xforms,
no filters, no oversample)
- Get frames loaded for rendering
- Get IterThread running in device kernel
- For now, implement as `PTXTest`
- For each frame, loop for FUSE times, then loop through expected number of
points for each CP. Keep a count of number of times looped, and number of
stores that would be done. Verify against expected counts.
- Draw some dang points!
- Allocate buffer (can it be pre-allocated?)
- Direct scatter linear points by GTID from flame number
- Re-enable preview window
- Execute frame, update texture, repeat
- Writeback of points to the buffer
- Define writeback class, args
- Do camera rotation across frameset
- Postpone other kinds of testing and address clamping for now
- Start xforms
- At first, fixed Sierpinski triangle or something
- xform selection, pre- and post-transform in xform
- first of the variations
Things to do (rather severely incomplete): Things to do (rather severely incomplete):
- LaunchContext thread distribution based on generated code register count and - LaunchContext thread distribution based on generated code register count and
shared memory size shared memory size
- qlocal storage - qlocal storage
@ -27,9 +27,6 @@ Things to do (rather severely incomplete):
- The `Feature` class - The `Feature` class
- Transform count and per-transform code layout - Transform count and per-transform code layout
- Filter size, oversample, final buffer size - Filter size, oversample, final buffer size
- Palette storage
- Performance implications of different state spaces
- Performance and quality of 2D texture interpolation
- Buffer allocation, clearing, reading from device - Buffer allocation, clearing, reading from device
- Preview window - Preview window
- When/how to sample? - When/how to sample?
@ -41,8 +38,24 @@ Things to do (rather severely incomplete):
- Implement - Implement
- Test effects on quality by masking off writes on all but one lane and - Test effects on quality by masking off writes on all but one lane and
boosting the sample density to compensate (muuuuuch later on) boosting the sample density to compensate (muuuuuch later on)
- MWC RNG output types - DE
- float in range [0, 1]
- Debug statements Things to test:
- Some code can't be tested separately (notably IterThread). Make a debug
flag which embeds extra tests into the kernel - DeviceStream allocator and proper handling of corner cases
- Debug flag/dict/whatever for entire project in general
- Iteration counters for IterThread
Things to benchmark:
- Kernel invocation and/or interrupt times (will high load freeze X?)
- 1D/2D texture load+interpolation speeds vs constant memory loading
- Must test under high SFU load
- Tex uses separate cache? Has lower bandwidth penalty for gather?
- MWC float conversion
- The entire scatter process
- Radix sort of writeback coordinates
- Log-copy-histogram approach
- Direct reductions
- Surface loads, stores, reductions

View File

@ -40,10 +40,10 @@ class IterThread(PTXTest):
op.mov.u32(num_writes, 0) op.mov.u32(num_writes, 0)
# TODO: MWC float output types # TODO: MWC float output types
#mwc_next_f32_01(x_coord) mwc_next_f32_01(x_coord)
#mwc_next_f32_01(y_coord) mwc_next_f32_01(y_coord)
#mwc_next_f32_01(color_coord) mwc_next_f32_01(color_coord)
#mwc_next_f32_01(alpha_coord) mwc_next_f32_01(alpha_coord)
# Registers are hard to come by. To avoid having to track both the count # Registers are hard to come by. To avoid having to track both the count
# of samples processed and the number of samples to generate, # of samples processed and the number of samples to generate,
@ -189,17 +189,40 @@ class MWCRNG(PTXFragment):
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr) op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car)) op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
# Advance the multiply-with-carry state by one step. The new 32-bit state is
# left in mwc_st (callers read their output from there) and the new carry in
# mwc_car.
@ptx_func
def _next(self):
# Call from inside a block!
# 64-bit scratch register that receives the widened multiply-add result.
reg.u64('mwc_out')
# Widen the current 32-bit carry to 64 bits so it can be the addend.
op.cvt.u64.u32(mwc_out, mwc_car)
# mwc_out = mwc_st * mwc_mult + carry, kept as a full 64-bit value.
op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
# Split the 64-bit result: low word becomes mwc_st, high word becomes mwc_car.
op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
@ptx_func @ptx_func
def next_b32(self, dst_reg): def next_b32(self, dst_reg):
with block('Load next random into ' + dst_reg.name): with block('Load next random u32 into ' + dst_reg.name):
reg.u64('mwc_out') self._next()
op.cvt.u64.u32(mwc_out, mwc_car)
op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
op.mov.u32(dst_reg, mwc_st) op.mov.u32(dst_reg, mwc_st)
@ptx_func
def next_f32_01(self, dst_reg):
    # Emit PTX that steps the MWC generator and stores a float in [0, 1]
    # into dst_reg. dst_reg is used as the f32 destination of both the
    # conversion and the scaling multiply.
    # TODO: verify that this is the fastest-performance method
    # TODO: verify the output distribution on an actual device
    with block('Load random float [0,1] into ' + dst_reg.name):
        self._next()
        # Unsigned state -> float. With round-to-nearest the largest inputs
        # convert to exactly 2^32, so the upper bound 1.0 is reachable.
        op.cvt.rn.f32.u32(dst_reg, mwc_st)
        # A PTX '0f' literal is the IEEE-754 single-precision bit pattern
        # written most-significant digit first; 2^-32 is 0x2F800000. The
        # previous byte-swapped value (0x0000802F) was a denormal, which
        # collapsed every result to ~0.
        op.mul.f32(dst_reg, dst_reg, '0f2F800000')  # 1./(1<<32)
@ptx_func
def next_f32_11(self, dst_reg):
    # Emit PTX that steps the MWC generator and stores a float spanning
    # roughly [-1, 1] into dst_reg, by reinterpreting the 32-bit state as a
    # signed integer and scaling by 2^-31.
    # NOTE(review): the block label says [-1,1), but round-to-nearest turns
    # the largest positive inputs into exactly 2^31, so 1.0 can occur.
    with block('Load random float [-1,1) into ' + dst_reg.name):
        self._next()
        op.cvt.rn.f32.s32(dst_reg, mwc_st)
        # Floating-point multiply takes no '.lo' modifier (that selects the
        # low half of an *integer* product), so this must be plain mul.f32.
        # The '0f' literal is the IEEE-754 bit pattern, most-significant
        # digit first: 2^-31 is 0x30000000.
        op.mul.f32(dst_reg, dst_reg, '0f30000000')  # 1./(1<<31)
def to_inject(self): def to_inject(self):
return dict(mwc_next_b32=self.next_b32) return dict(mwc_next_b32=self.next_b32,
mwc_next_f32_01=self.next_f32_01,
mwc_next_f32_11=self.next_f32_11)
def device_init(self, ctx): def device_init(self, ctx):
if self.threads_ready >= ctx.threads: if self.threads_ready >= ctx.threads: