mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Random floats (I think)
This commit is contained in:
parent
f3298e0bed
commit
ada0fe20c7
59
TODO
59
TODO
@ -1,23 +1,23 @@
|
|||||||
Status: currently broken (syntax errors, incomplete sections)
|
Status: passes rudimentary tests
|
||||||
|
|
||||||
Current goals:
|
Current goals:
|
||||||
- Test DeviceStream, and get it working. Bugs are expected.
|
|
||||||
- Test allocator
|
|
||||||
- Test statement evaluator
|
|
||||||
- Test packing correctly
|
|
||||||
- Test that device instructions get injected correctly
|
|
||||||
- Test in working implementation
|
|
||||||
- Load a set of genomes and calculate a bare minimum `Feature` set (no xforms,
|
|
||||||
no filters, no oversample)
|
|
||||||
- Get frames loaded for rendering
|
|
||||||
- Get IterThread running in device kernel
|
|
||||||
- For now, implement as `PTXTest`
|
|
||||||
- For each frame, loop for FUSE times, then loop through expected number of
|
|
||||||
points for each CP. Keep a count of number of times looped, and number of
|
|
||||||
stores that would be done. Verify against expected counts.
|
|
||||||
|
|
||||||
|
- Draw some dang points!
|
||||||
|
- Allocate buffer (can it be pre-allocated?)
|
||||||
|
- Direct scatter linear points by GTID from flame number
|
||||||
|
- Re-enable preview window
|
||||||
|
- Execute frame, update texture, repeat
|
||||||
|
- Writeback of points to the buffer
|
||||||
|
- Define writeback class, args
|
||||||
|
- Do camera rotation across frameset
|
||||||
|
- Postpone other kinds of testing and address clamping for now
|
||||||
|
- Start xforms
|
||||||
|
- At first, fixed Sierpinski triangle or something
|
||||||
|
- xform selection, pre- and post-transform in xform
|
||||||
|
- first of the variations
|
||||||
|
|
||||||
Things to do (rather severely incomplete):
|
Things to do (rather severely incomplete):
|
||||||
|
|
||||||
- LaunchContext thread distribution based on generated code register count and
|
- LaunchContext thread distribution based on generated code register count and
|
||||||
shared memory size
|
shared memory size
|
||||||
- qlocal storage
|
- qlocal storage
|
||||||
@ -27,9 +27,6 @@ Things to do (rather severely incomplete):
|
|||||||
- The `Feature` class
|
- The `Feature` class
|
||||||
- Transform count and per-transform code layout
|
- Transform count and per-transform code layout
|
||||||
- Filter size, oversample, final buffer size
|
- Filter size, oversample, final buffer size
|
||||||
- Palette storage
|
|
||||||
- Performance implications of different state spaces
|
|
||||||
- Performance and quality of 2D texture interpolation
|
|
||||||
- Buffer allocation, clearing, reading from device
|
- Buffer allocation, clearing, reading from device
|
||||||
- Preview window
|
- Preview window
|
||||||
- When/how to sample?
|
- When/how to sample?
|
||||||
@ -41,8 +38,24 @@ Things to do (rather severely incomplete):
|
|||||||
- Implement
|
- Implement
|
||||||
- Test effects on quality by masking off writes on all but one lane and
|
- Test effects on quality by masking off writes on all but one lane and
|
||||||
boosting the sample density to compensate (muuuuuch later on)
|
boosting the sample density to compensate (muuuuuch later on)
|
||||||
- MWC RNG output types
|
- DE
|
||||||
- float in range [0, 1]
|
|
||||||
- Debug statements
|
Things to test:
|
||||||
- Some code can't be tested separately (notably IterThread). Make a debug
|
|
||||||
flag which embeds extra tests into the kernel
|
- DeviceStream allocator and proper handling of corner cases
|
||||||
|
- Debug flag/dict/whatever for entire project in general
|
||||||
|
- Iteration counters for IterThread
|
||||||
|
|
||||||
|
Things to benchmark:
|
||||||
|
|
||||||
|
- Kernel invocation and/or interrupt times (will high load freeze X?)
|
||||||
|
- 1D/2D texture load+interpolation speeds vs constant memory loading
|
||||||
|
- Must test under high SFU load
|
||||||
|
- Does tex use a separate cache? Does it have a lower bandwidth penalty for gather?
|
||||||
|
- MWC float conversion
|
||||||
|
- The entire scatter process
|
||||||
|
- Radix sort of writeback coordinates
|
||||||
|
- Log-copy-histogram approach
|
||||||
|
- Direct reductions
|
||||||
|
- Surface loads, stores, reductions
|
||||||
|
|
||||||
|
@ -40,10 +40,10 @@ class IterThread(PTXTest):
|
|||||||
op.mov.u32(num_writes, 0)
|
op.mov.u32(num_writes, 0)
|
||||||
|
|
||||||
# TODO: MWC float output types
|
# TODO: MWC float output types
|
||||||
#mwc_next_f32_01(x_coord)
|
mwc_next_f32_01(x_coord)
|
||||||
#mwc_next_f32_01(y_coord)
|
mwc_next_f32_01(y_coord)
|
||||||
#mwc_next_f32_01(color_coord)
|
mwc_next_f32_01(color_coord)
|
||||||
#mwc_next_f32_01(alpha_coord)
|
mwc_next_f32_01(alpha_coord)
|
||||||
|
|
||||||
# Registers are hard to come by. To avoid having to track both the count
|
# Registers are hard to come by. To avoid having to track both the count
|
||||||
# of samples processed and the number of samples to generate,
|
# of samples processed and the number of samples to generate,
|
||||||
@ -190,16 +190,39 @@ class MWCRNG(PTXFragment):
|
|||||||
op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
|
op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
|
||||||
|
|
||||||
@ptx_func
|
@ptx_func
|
||||||
def next_b32(self, dst_reg):
|
def _next(self):
|
||||||
with block('Load next random into ' + dst_reg.name):
|
# Call from inside a block!
|
||||||
reg.u64('mwc_out')
|
reg.u64('mwc_out')
|
||||||
op.cvt.u64.u32(mwc_out, mwc_car)
|
op.cvt.u64.u32(mwc_out, mwc_car)
|
||||||
op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
|
op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
|
||||||
op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
|
op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
|
||||||
|
|
||||||
|
@ptx_func
|
||||||
|
def next_b32(self, dst_reg):
|
||||||
|
with block('Load next random u32 into ' + dst_reg.name):
|
||||||
|
self._next()
|
||||||
op.mov.u32(dst_reg, mwc_st)
|
op.mov.u32(dst_reg, mwc_st)
|
||||||
|
|
||||||
|
@ptx_func
def next_f32_01(self, dst_reg):
    # Step the generator via self._next(), then write mwc_st, converted to
    # f32 and scaled by 2**-32, into dst_reg.
    # TODO: verify that this is the fastest-performance method
    # TODO: verify that this actually does what I think it does
    with block('Load random float [0,1] into ' + dst_reg.name):
        self._next()
        # Interpret mwc_st as unsigned and convert to f32, round-to-nearest.
        op.cvt.rn.f32.u32(dst_reg, mwc_st)
        # A PTX '0f' literal is eight hex digits giving the IEEE-754 bit
        # pattern most-significant digit first. 2**-32 is 0x2F800000; the
        # previous byte-reversed '0f0000802F' encoded a denormal instead.
        op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)
|
||||||
|
|
||||||
|
@ptx_func
def next_f32_11(self, dst_reg):
    # Step the generator via self._next(), then write mwc_st, converted as a
    # signed 32-bit integer to f32 and scaled by 2**-31, into dst_reg.
    # NOTE(review): cvt.rn may round the largest positive inputs up to 2**31,
    # so the upper bound 1.0 is probably reachable -- confirm before relying
    # on the half-open range named in the block label.
    with block('Load random float [-1,1) into ' + dst_reg.name):
        self._next()
        # Reinterpret mwc_st as signed and convert to f32, round-to-nearest.
        op.cvt.rn.f32.s32(dst_reg, mwc_st)
        # '.lo' is an integer-only multiply mode, so the float multiply must
        # be plain mul.f32. The '0f' literal is the IEEE-754 bit pattern,
        # most-significant digit first: 2**-31 is 0x30000000 (the previous
        # '0f00000030' was byte-reversed and encoded a denormal).
        op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)
|
||||||
|
|
||||||
def to_inject(self):
|
def to_inject(self):
|
||||||
return dict(mwc_next_b32=self.next_b32)
|
return dict(mwc_next_b32=self.next_b32,
|
||||||
|
mwc_next_f32_01=self.next_f32_01,
|
||||||
|
mwc_next_f32_11=self.next_f32_11)
|
||||||
|
|
||||||
def device_init(self, ctx):
|
def device_init(self, ctx):
|
||||||
if self.threads_ready >= ctx.threads:
|
if self.threads_ready >= ctx.threads:
|
||||||
|
Loading…
Reference in New Issue
Block a user