mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-04 02:10:45 -05:00 
			
		
		
		
	Lots-o-stuff.
This commit is contained in:
		
							
								
								
									
										14
									
								
								TODO
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								TODO
									
									
									
									
									
								
							@ -2,17 +2,7 @@ Status: passes rudimentary tests
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
Current goals:
 | 
					Current goals:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- Draw some dang points!
 | 
					 | 
				
			||||||
    - Allocate buffer (can it be pre-allocated?)
 | 
					 | 
				
			||||||
    - Direct scatter linear points by GTID from flame number
 | 
					 | 
				
			||||||
    - Re-enable preview window
 | 
					 | 
				
			||||||
    - Execute frame, update texture, repeat
 | 
					 | 
				
			||||||
- Writeback of points to the buffer
 | 
					 | 
				
			||||||
    - Define writeback class, args
 | 
					 | 
				
			||||||
    - Do camera rotation across frameset
 | 
					 | 
				
			||||||
    - Postpone other kinds of testing and address clamping for now
 | 
					 | 
				
			||||||
- Start xforms
 | 
					- Start xforms
 | 
				
			||||||
    - At first, fixed Sierpinski triangle or something
 | 
					 | 
				
			||||||
    - xform selection, pre- and post-transform in xform
 | 
					    - xform selection, pre- and post-transform in xform
 | 
				
			||||||
    - first of the variations
 | 
					    - first of the variations
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -43,16 +33,12 @@ Things to do (rather severely incomplete):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
Things to test:
 | 
					Things to test:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- DeviceStream allocator and proper handling of corner cases
 | 
					 | 
				
			||||||
- Debug flag/dict/whatever for entire project in general
 | 
					- Debug flag/dict/whatever for entire project in general
 | 
				
			||||||
    - Iteration counters for IterThread
 | 
					    - Iteration counters for IterThread
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Things to benchmark:
 | 
					Things to benchmark:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- Kernel invocation and/or interrupt times (will high load freeze X?)
 | 
					- Kernel invocation and/or interrupt times (will high load freeze X?)
 | 
				
			||||||
- 1D/2D texture load+interpolation speeds vs constant memory loading
 | 
					 | 
				
			||||||
    - Must test under high SFU load
 | 
					 | 
				
			||||||
    - Tex uses separate cache? Has lower bandwidth penalty for gather?
 | 
					 | 
				
			||||||
- MWC float conversion
 | 
					- MWC float conversion
 | 
				
			||||||
- The entire scatter process
 | 
					- The entire scatter process
 | 
				
			||||||
    - Radix sort of writeback coordinates
 | 
					    - Radix sort of writeback coordinates
 | 
				
			||||||
 | 
				
			|||||||
@ -11,7 +11,7 @@ import numpy as np
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from cuburnlib.ptx import *
 | 
					from cuburnlib.ptx import *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class IterThread(PTXTest):
 | 
					class IterThread(PTXEntryPoint):
 | 
				
			||||||
    entry_name = 'iter_thread'
 | 
					    entry_name = 'iter_thread'
 | 
				
			||||||
    entry_params = []
 | 
					    entry_params = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -19,7 +19,7 @@ class IterThread(PTXTest):
 | 
				
			|||||||
        self.cps_uploaded = False
 | 
					        self.cps_uploaded = False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def deps(self):
 | 
					    def deps(self):
 | 
				
			||||||
        return [MWCRNG, CPDataStream]
 | 
					        return [MWCRNG, CPDataStream, HistScatter]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @ptx_func
 | 
					    @ptx_func
 | 
				
			||||||
    def module_setup(self):
 | 
					    def module_setup(self):
 | 
				
			||||||
@ -61,18 +61,19 @@ class IterThread(PTXTest):
 | 
				
			|||||||
        reg.u32('cp_idx cpA')
 | 
					        reg.u32('cp_idx cpA')
 | 
				
			||||||
        with block("Claim a CP"):
 | 
					        with block("Claim a CP"):
 | 
				
			||||||
            std.set_is_first_thread(reg.pred('p_is_first'))
 | 
					            std.set_is_first_thread(reg.pred('p_is_first'))
 | 
				
			||||||
            op.atom.inc.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first)
 | 
					            op.atom.add.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first)
 | 
				
			||||||
            op.st.shared.u32(addr(s_cp_idx), cp_idx, ifp=p_is_first)
 | 
					            op.st.shared.u32(addr(s_cp_idx), cp_idx, ifp=p_is_first)
 | 
				
			||||||
 | 
					            op.st.shared.u32(addr(s_num_samples), 0, ifp=p_is_first)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        comment("Load the CP index in all threads")
 | 
					        comment("Load the CP index in all threads")
 | 
				
			||||||
        op.bar.sync(0)
 | 
					        op.bar.sync(1)
 | 
				
			||||||
        op.ld.shared.u32(cp_idx, addr(s_cp_idx))
 | 
					        op.ld.shared.u32(cp_idx, addr(s_cp_idx))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        with block("Check to see if this CP is valid (if not, we're done"):
 | 
					        with block("Check to see if this CP is valid (if not, we're done)"):
 | 
				
			||||||
            reg.u32('num_cps')
 | 
					            reg.u32('num_cps')
 | 
				
			||||||
            reg.pred('p_last_cp')
 | 
					            reg.pred('p_last_cp')
 | 
				
			||||||
            op.ldu.u32(num_cps, addr(g_num_cps))
 | 
					            op.ldu.u32(num_cps, addr(g_num_cps))
 | 
				
			||||||
            op.setp.ge.u32(p_last_cp, cp_idx, 1)
 | 
					            op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
 | 
				
			||||||
            op.bra.uni('all_cps_done', ifp=p_last_cp)
 | 
					            op.bra.uni('all_cps_done', ifp=p_last_cp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        with block('Load CP address'):
 | 
					        with block('Load CP address'):
 | 
				
			||||||
@ -85,33 +86,37 @@ class IterThread(PTXTest):
 | 
				
			|||||||
        with block("If still fusing, increment count unconditionally"):
 | 
					        with block("If still fusing, increment count unconditionally"):
 | 
				
			||||||
            std.set_is_first_thread(reg.pred('p_is_first'))
 | 
					            std.set_is_first_thread(reg.pred('p_is_first'))
 | 
				
			||||||
            op.red.shared.add.s32(addr(s_num_samples), 1, ifp=p_is_first)
 | 
					            op.red.shared.add.s32(addr(s_num_samples), 1, ifp=p_is_first)
 | 
				
			||||||
            op.bar.sync(0)
 | 
					            op.bar.sync(2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        label('iter_loop_start')
 | 
					        label('iter_loop_start')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        comment('Do... well, most of everything')
 | 
					        comment('Do... well, most of everything')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        mwc.next_f32_11(x_coord)
 | 
				
			||||||
 | 
					        mwc.next_f32_11(y_coord)
 | 
				
			||||||
 | 
					        mwc.next_f32_01(color_coord)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        op.add.u32(num_rounds, num_rounds, 1)
 | 
					        op.add.u32(num_rounds, num_rounds, 1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        with block("Test if we're still in FUSE"):
 | 
					        with block("Test if we're still in FUSE"):
 | 
				
			||||||
            reg.s32('num_samples')
 | 
					            reg.s32('num_samples')
 | 
				
			||||||
            reg.pred('p_in_fuse')
 | 
					            reg.pred('p_in_fuse')
 | 
				
			||||||
            op.ld.shared.u32(num_samples, addr(s_num_samples))
 | 
					            op.ld.shared.s32(num_samples, addr(s_num_samples))
 | 
				
			||||||
            op.setp.lt.s32(p_in_fuse, num_samples, 0)
 | 
					            op.setp.lt.s32(p_in_fuse, num_samples, 0)
 | 
				
			||||||
            op.bra.uni(fuse_loop_start, ifp=p_in_fuse)
 | 
					            op.bra.uni(fuse_loop_start, ifp=p_in_fuse)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        with block("Ordinarily, we'd write the result here"):
 | 
					        reg.pred('p_point_is_valid')
 | 
				
			||||||
            op.add.u32(num_writes, num_writes, 1)
 | 
					        with block("Write the result"):
 | 
				
			||||||
 | 
					            hist.scatter(x_coord, y_coord, color_coord, 0, p_point_is_valid)
 | 
				
			||||||
        # For testing, declare and clear p_badval
 | 
					            op.add.u32(num_writes, num_writes, 1, ifp=p_point_is_valid)
 | 
				
			||||||
        reg.pred('p_goodval')
 | 
					 | 
				
			||||||
        op.setp.eq.u32(p_goodval, 1, 1)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        with block("Increment number of samples by number of good values"):
 | 
					        with block("Increment number of samples by number of good values"):
 | 
				
			||||||
            reg.b32('good_samples')
 | 
					            reg.b32('good_samples laneid')
 | 
				
			||||||
            op.vote.ballot.b32(good_samples, p_goodval)
 | 
					            reg.pred('p_is_first')
 | 
				
			||||||
 | 
					            op.vote.ballot.b32(good_samples, p_point_is_valid)
 | 
				
			||||||
            op.popc.b32(good_samples, good_samples)
 | 
					            op.popc.b32(good_samples, good_samples)
 | 
				
			||||||
            std.set_is_first_thread(reg.pred('p_is_first'))
 | 
					            op.mov.u32(laneid, '%laneid')
 | 
				
			||||||
 | 
					            op.setp.eq.u32(p_is_first, laneid, 0)
 | 
				
			||||||
            op.red.shared.add.s32(addr(s_num_samples), good_samples,
 | 
					            op.red.shared.add.s32(addr(s_num_samples), good_samples,
 | 
				
			||||||
                                  ifp=p_is_first)
 | 
					                                  ifp=p_is_first)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -138,6 +143,9 @@ class IterThread(PTXTest):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
 | 
					        num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
 | 
				
			||||||
        cuda.memset_d32(num_cps_dp, num_cps, 1)
 | 
					        cuda.memset_d32(num_cps_dp, num_cps, 1)
 | 
				
			||||||
 | 
					        # TODO: "if debug >= 3"
 | 
				
			||||||
 | 
					        print "Uploaded stream to card:"
 | 
				
			||||||
 | 
					        CPDataStream.print_record(ctx, cp_stream, 5)
 | 
				
			||||||
        self.cps_uploaded = True
 | 
					        self.cps_uploaded = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @instmethod
 | 
					    @instmethod
 | 
				
			||||||
@ -148,14 +156,228 @@ class IterThread(PTXTest):
 | 
				
			|||||||
        cuda.memset_d32(num_cps_st_dp, 0, 1)
 | 
					        cuda.memset_d32(num_cps_st_dp, 0, 1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        func = ctx.mod.get_function('iter_thread')
 | 
					        func = ctx.mod.get_function('iter_thread')
 | 
				
			||||||
        dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)
 | 
					        tr = ctx.ptx.instances[PaletteLookup].texref
 | 
				
			||||||
 | 
					        dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True,
 | 
				
			||||||
 | 
					                     texrefs=[tr])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        shape = (ctx.grid[0], ctx.block[0]/32, 32)
 | 
				
			||||||
        num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
 | 
					        num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
 | 
				
			||||||
        num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
 | 
					        num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
 | 
				
			||||||
        rounds = cuda.from_device(num_rounds_dp, ctx.threads, np.uint32)
 | 
					        rounds = cuda.from_device(num_rounds_dp, shape, np.int32)
 | 
				
			||||||
        writes = cuda.from_device(num_writes_dp, ctx.threads, np.uint32)
 | 
					        writes = cuda.from_device(num_writes_dp, shape, np.int32)
 | 
				
			||||||
        print "Rounds:", rounds
 | 
					        print "Rounds:", sum(rounds)
 | 
				
			||||||
        print "Writes:", writes
 | 
					        print "Writes:", sum(writes)
 | 
				
			||||||
 | 
					        print rounds
 | 
				
			||||||
 | 
					        print writes
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class CameraTransform(PTXFragment):
 | 
				
			||||||
 | 
					    shortname = 'camera'
 | 
				
			||||||
 | 
					    def deps(self):
 | 
				
			||||||
 | 
					        return [CPDataStream]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @ptx_func
 | 
				
			||||||
 | 
					    def rotate(self, rotated_x, rotated_y, x, y):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        Rotate an IFS-space coordinate as defined by the camera.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        if features.camera_rotation:
 | 
				
			||||||
 | 
					            assert rotated_x.name != x.name and rotated_y.name != y.name
 | 
				
			||||||
 | 
					            with block("Rotate %s, %s to camera alignment" % (x, y)):
 | 
				
			||||||
 | 
					                reg.f32('rot_center_x rot_center_y')
 | 
				
			||||||
 | 
					                cp.get_v2(cpA, rot_center_x, 'cp.rot_center[0]',
 | 
				
			||||||
 | 
					                                      rot_center_y, 'cp.rot_center[1]')
 | 
				
			||||||
 | 
					                op.sub.f32(x, x, rot_center_x)
 | 
				
			||||||
 | 
					                op.sub.f32(y, y, rot_center_y)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                reg.f32('rot_sin_t rot_cos_t rot_old_x rot_old_y')
 | 
				
			||||||
 | 
					                cp.get_v2(cpA, rot_cos_t,  'cos(cp.rotate * 2 * pi / 360.)',
 | 
				
			||||||
 | 
					                               rot_sin_t, '-sin(cp.rotate * 2 * pi / 360.)')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                comment('rotated_x = x * cos(t) - y * sin(t) + rot_center_x')
 | 
				
			||||||
 | 
					                op.fma.rn.f32(rotated_x, x, rot_cos_t, rot_center_x)
 | 
				
			||||||
 | 
					                op.fma.rn.f32(rotated_x, y, rot_sin_t, rotated_x)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                op.neg.f32(rot_sin_t, rot_sin_t)
 | 
				
			||||||
 | 
					                comment('rotated_y = x * sin(t) + y * cos(t) + rot_center_y')
 | 
				
			||||||
 | 
					                op.fma.rn.f32(rotated_y, x, rot_sin_t, rot_center_y)
 | 
				
			||||||
 | 
					                op.fma.rn.f32(rotated_y, y, rot_cos_t, rotated_y)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                # TODO: if this is a register-critical section, reloading
 | 
				
			||||||
 | 
					                # rot_center_[xy] here should save two regs. OTOH, if this is
 | 
				
			||||||
 | 
					                # *not* reg-crit, moving the subtraction above to new variables
 | 
				
			||||||
 | 
					                # may save a few clocks
 | 
				
			||||||
 | 
					                op.add.f32(x, x, rot_center_x)
 | 
				
			||||||
 | 
					                op.add.f32(y, y, rot_center_y)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            comment("No camera rotation in this kernel")
 | 
				
			||||||
 | 
					            op.mov.f32(rotated_x, x)
 | 
				
			||||||
 | 
					            op.mov.f32(rotated_y, y)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @ptx_func
 | 
				
			||||||
 | 
					    def get_norm(self, norm_x, norm_y, x, y):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        Find the [0,1]-normalized floating-point histogram coordinates
 | 
				
			||||||
 | 
					        ``norm_x, norm_y`` from the given IFS-space coordinates ``x, y``.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        self.rotate(norm_x, norm_y, x, y)
 | 
				
			||||||
 | 
					        with block("Scale rotated points to [0,1]-normalized coordinates"):
 | 
				
			||||||
 | 
					            reg.f32('cam_scale cam_offset')
 | 
				
			||||||
 | 
					            cp.get_v2(cpA, cam_scale,  'cp.camera.norm_scale[0]',
 | 
				
			||||||
 | 
					                           cam_offset, 'cp.camera.norm_offset[0]')
 | 
				
			||||||
 | 
					            op.fma.f32(norm_x, norm_x, cam_scale, cam_offset)
 | 
				
			||||||
 | 
					            cp.get_v2(cpA, cam_scale,  'cp.camera.norm_scale[1]',
 | 
				
			||||||
 | 
					                           cam_offset, 'cp.camera.norm_offset[1]')
 | 
				
			||||||
 | 
					            op.fma.f32(norm_y, norm_y, cam_scale, cam_offset)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @ptx_func
 | 
				
			||||||
 | 
					    def get_index(self, index, x, y, pred=None):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        Find the histogram index (as a u32) from the IFS spatial coordinate in
 | 
				
			||||||
 | 
					        ``x, y``.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        If the coordinates are out of bounds, 0xffffffff will be stored to
 | 
				
			||||||
 | 
					        ``index``. If ``pred`` is given, it will be set if the point is valid,
 | 
				
			||||||
 | 
					        and cleared if not.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        # A few instructions could probably be shaved off of this one
 | 
				
			||||||
 | 
					        with block("Find histogram index"):
 | 
				
			||||||
 | 
					            reg.f32('norm_x norm_y')
 | 
				
			||||||
 | 
					            self.rotate(norm_x, norm_y, x, y)
 | 
				
			||||||
 | 
					            comment('Scale and offset from IFS to index coordinates')
 | 
				
			||||||
 | 
					            reg.f32('cam_scale cam_offset')
 | 
				
			||||||
 | 
					            cp.get_v2(cpA, cam_scale,  'cp.camera.idx_scale[0]',
 | 
				
			||||||
 | 
					                           cam_offset, 'cp.camera.idx_offset[0]')
 | 
				
			||||||
 | 
					            op.fma.rn.f32(norm_x, norm_x, cam_scale, cam_offset)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            cp.get_v2(cpA, cam_scale,  'cp.camera.idx_scale[1]',
 | 
				
			||||||
 | 
					                           cam_offset, 'cp.camera.idx_offset[1]')
 | 
				
			||||||
 | 
					            op.fma.rn.f32(norm_y, norm_y, cam_scale, cam_offset)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            comment('Check for bad value')
 | 
				
			||||||
 | 
					            reg.u32('index_x index_y')
 | 
				
			||||||
 | 
					            if not pred:
 | 
				
			||||||
 | 
					                pred = reg.pred('p_valid')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            op.cvt.rzi.s32.f32(index_x, norm_x)
 | 
				
			||||||
 | 
					            op.setp.ge.s32(pred, index_x, 0)
 | 
				
			||||||
 | 
					            op.setp.lt.and_.s32(pred, index_x, features.hist_width, pred)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            op.cvt.rzi.s32.f32(index_y, norm_y)
 | 
				
			||||||
 | 
					            op.setp.ge.and_.s32(pred, index_y, 0, pred)
 | 
				
			||||||
 | 
					            op.setp.lt.and_.s32(pred, index_y, features.hist_height, pred)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            op.mad.lo.u32(index, index_y, features.hist_stride, index_x)
 | 
				
			||||||
 | 
					            op.mov.u32(index, 0xffffffff, ifnotp=pred)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class PaletteLookup(PTXFragment):
 | 
				
			||||||
 | 
					    shortname = "palette"
 | 
				
			||||||
 | 
					    # Resolution of texture on device. Bigger = more palette rez, maybe slower
 | 
				
			||||||
 | 
					    texheight = 16
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self):
 | 
				
			||||||
 | 
					        self.texref = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def deps(self):
 | 
				
			||||||
 | 
					        return [CPDataStream]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @ptx_func
 | 
				
			||||||
 | 
					    def module_setup(self):
 | 
				
			||||||
 | 
					        mem.global_.texref('t_palette')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @ptx_func
 | 
				
			||||||
 | 
					    def look_up(self, r, g, b, a, color, norm_time):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        Look up the values of ``r, g, b, a`` corresponding to ``color`` at the
 | 
				
			||||||
 | 
					        time given by ``norm_time``. Note that both ``color`` and
 | 
				
			||||||
 | 
					        ``norm_time`` should be [0,1]-normalized floats.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        op.tex._2d.v4.f32.f32(vec(r, g, b, a),
 | 
				
			||||||
 | 
					                addr([t_palette, ', ',  vec(norm_time, color)]))
 | 
				
			||||||
 | 
					        if features.non_box_temporal_filter:
 | 
				
			||||||
 | 
					            raise NotImplementedError("Non-box temporal filters not supported")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @instmethod
 | 
				
			||||||
 | 
					    def upload_palette(self, ctx, frame, cp_list):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        Extract the palette from the given list of interpolated CPs, and upload
 | 
				
			||||||
 | 
					        it to the device as a texture.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        # TODO: figure out if storing the full list is an actual drag on
 | 
				
			||||||
 | 
					        # performance/memory
 | 
				
			||||||
 | 
					        if frame.center_cp.temporal_filter_type != 0:
 | 
				
			||||||
 | 
					            # TODO: make texture sample based on time, not on CP index
 | 
				
			||||||
 | 
					            raise NotImplementedError("Use box temporal filters for now")
 | 
				
			||||||
 | 
					        pal = np.ndarray((self.texheight, 256, 4), dtype=np.float32)
 | 
				
			||||||
 | 
					        inv = float(len(cp_list) - 1) / (self.texheight - 1)
 | 
				
			||||||
 | 
					        for y in range(self.texheight):
 | 
				
			||||||
 | 
					            for x in range(256):
 | 
				
			||||||
 | 
					                for c in range(4):
 | 
				
			||||||
 | 
					                    # TODO: interpolate here?
 | 
				
			||||||
 | 
					                    cy = int(round(y * inv))
 | 
				
			||||||
 | 
					                    pal[y][x][c] = cp_list[cy].palette.entries[x].color[c]
 | 
				
			||||||
 | 
					        dev_array = cuda.make_multichannel_2d_array(pal, "C")
 | 
				
			||||||
 | 
					        self.texref = ctx.mod.get_texref('t_palette')
 | 
				
			||||||
 | 
					        # TODO: float16? or can we still use interp with int storage?
 | 
				
			||||||
 | 
					        self.texref.set_format(cuda.array_format.FLOAT, 4)
 | 
				
			||||||
 | 
					        self.texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
 | 
				
			||||||
 | 
					        self.texref.set_filter_mode(cuda.filter_mode.LINEAR)
 | 
				
			||||||
 | 
					        self.texref.set_address_mode(0, cuda.address_mode.CLAMP)
 | 
				
			||||||
 | 
					        self.texref.set_address_mode(1, cuda.address_mode.CLAMP)
 | 
				
			||||||
 | 
					        self.texref.set_array(dev_array)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def device_init(self, ctx):
 | 
				
			||||||
 | 
					        assert self.texref, "Must upload palette texture before launch!"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class HistScatter(PTXFragment):
 | 
				
			||||||
 | 
					    shortname = "hist"
 | 
				
			||||||
 | 
					    def deps(self):
 | 
				
			||||||
 | 
					        return [CPDataStream, CameraTransform, PaletteLookup]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @ptx_func
 | 
				
			||||||
 | 
					    def module_setup(self):
 | 
				
			||||||
 | 
					        mem.global_.f32('g_hist_bins',
 | 
				
			||||||
 | 
					                        features.hist_height * features.hist_stride * 4)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @ptx_func
 | 
				
			||||||
 | 
					    def entry_setup(self):
 | 
				
			||||||
 | 
					        comment("For now, assume histogram bins have been cleared by host")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @ptx_func
 | 
				
			||||||
 | 
					    def scatter(self, x, y, color, xf_idx, p_valid=None):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        Scatter the given point directly to the histogram bins. I think this
 | 
				
			||||||
 | 
					        technique has the worst performance of all of 'em. Accesses ``cpA``
 | 
				
			||||||
 | 
					        directly.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        with block("Scatter directly to buffer"):
 | 
				
			||||||
 | 
					            if p_valid is None:
 | 
				
			||||||
 | 
					                p_valid = reg.pred('p_valid')
 | 
				
			||||||
 | 
					            reg.u32('hist_index')
 | 
				
			||||||
 | 
					            camera.get_index(hist_index, x, y, p_valid)
 | 
				
			||||||
 | 
					            reg.u32('hist_bin_addr')
 | 
				
			||||||
 | 
					            op.mov.u32(hist_bin_addr, g_hist_bins)
 | 
				
			||||||
 | 
					            op.mad.lo.u32(hist_bin_addr, hist_index, 16, hist_bin_addr)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            reg.f32('r g b a norm_time')
 | 
				
			||||||
 | 
					            cp.get(cpA, norm_time, 'cp.norm_time')
 | 
				
			||||||
 | 
					            palette.look_up(r, g, b, a, color, norm_time)
 | 
				
			||||||
 | 
					            # TODO: look up, scale by xform visibility
 | 
				
			||||||
 | 
					            op.red.add.f32(addr(hist_bin_addr), r)
 | 
				
			||||||
 | 
					            op.red.add.f32(addr(hist_bin_addr,4), g)
 | 
				
			||||||
 | 
					            op.red.add.f32(addr(hist_bin_addr,8), b)
 | 
				
			||||||
 | 
					            op.red.add.f32(addr(hist_bin_addr,12), a)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def device_init(self, ctx):
 | 
				
			||||||
 | 
					        hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins')
 | 
				
			||||||
 | 
					        cuda.memset_d32(hist_bins_dp, 0, hist_bins_l/4)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @instmethod
 | 
				
			||||||
 | 
					    def get_bins(self, ctx, features):
 | 
				
			||||||
 | 
					        hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins')
 | 
				
			||||||
 | 
					        return cuda.from_device(hist_bins_dp,
 | 
				
			||||||
 | 
					                (features.hist_height, features.hist_stride, 4),
 | 
				
			||||||
 | 
					                dtype=np.float32)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class MWCRNG(PTXFragment):
 | 
					class MWCRNG(PTXFragment):
 | 
				
			||||||
    shortname = "mwc"
 | 
					    shortname = "mwc"
 | 
				
			||||||
@ -218,14 +440,15 @@ class MWCRNG(PTXFragment):
 | 
				
			|||||||
        with block('Load random float [0,1] into ' + dst_reg.name):
 | 
					        with block('Load random float [0,1] into ' + dst_reg.name):
 | 
				
			||||||
            self._next()
 | 
					            self._next()
 | 
				
			||||||
            op.cvt.rn.f32.u32(dst_reg, mwc_st)
 | 
					            op.cvt.rn.f32.u32(dst_reg, mwc_st)
 | 
				
			||||||
            op.mul.f32(dst_reg, dst_reg, '0f0000802F') # 1./(1<<32)
 | 
					            op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @ptx_func
 | 
					    @ptx_func
 | 
				
			||||||
    def next_f32_11(self, dst_reg):
 | 
					    def next_f32_11(self, dst_reg):
 | 
				
			||||||
        with block('Load random float [-1,1) into ' + dst_reg.name):
 | 
					        with block('Load random float [-1,1) into ' + dst_reg.name):
 | 
				
			||||||
 | 
					            reg.u32('mwc_to_float')
 | 
				
			||||||
            self._next()
 | 
					            self._next()
 | 
				
			||||||
            op.cvt.rn.f32.s32(dst_reg, mwc_st)
 | 
					            op.cvt.rn.f32.s32(dst_reg, mwc_st)
 | 
				
			||||||
            op.mul.f32(dst_reg, dst_reg, '0f00000030') # 1./(1<<31)
 | 
					            op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def device_init(self, ctx):
 | 
					    def device_init(self, ctx):
 | 
				
			||||||
        if self.threads_ready >= ctx.threads:
 | 
					        if self.threads_ready >= ctx.threads:
 | 
				
			||||||
 | 
				
			|||||||
@ -14,6 +14,7 @@ import types
 | 
				
			|||||||
import struct
 | 
					import struct
 | 
				
			||||||
from cStringIO import StringIO
 | 
					from cStringIO import StringIO
 | 
				
			||||||
from collections import namedtuple
 | 
					from collections import namedtuple
 | 
				
			||||||
 | 
					from math import *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Okay, so here's what's going on.
 | 
					# Okay, so here's what's going on.
 | 
				
			||||||
#
 | 
					#
 | 
				
			||||||
@ -137,7 +138,7 @@ class _Block(object):
 | 
				
			|||||||
        self.stack = [self.outer_ctx]
 | 
					        self.stack = [self.outer_ctx]
 | 
				
			||||||
    def clean_injectors(self):
 | 
					    def clean_injectors(self):
 | 
				
			||||||
        inj = self.stack[-1].injectors
 | 
					        inj = self.stack[-1].injectors
 | 
				
			||||||
        [inj.remove(i) for i in inj if i.dead]
 | 
					        [inj.remove(i) for i in list(inj) if i.dead]
 | 
				
			||||||
    def push_ctx(self):
 | 
					    def push_ctx(self):
 | 
				
			||||||
        self.clean_injectors()
 | 
					        self.clean_injectors()
 | 
				
			||||||
        self.stack.append(BlockCtx(dict(self.stack[-1].locals), [], []))
 | 
					        self.stack.append(BlockCtx(dict(self.stack[-1].locals), [], []))
 | 
				
			||||||
@ -155,8 +156,6 @@ class _Block(object):
 | 
				
			|||||||
    def pop_ctx(self):
 | 
					    def pop_ctx(self):
 | 
				
			||||||
        self.clean_injectors()
 | 
					        self.clean_injectors()
 | 
				
			||||||
        bs = self.stack.pop()
 | 
					        bs = self.stack.pop()
 | 
				
			||||||
        # TODO: figure out why this next line is needed
 | 
					 | 
				
			||||||
        [bs.injectors.remove(i) for i in bs.injectors if i.dead]
 | 
					 | 
				
			||||||
        self.stack[-1].code.extend(bs.code)
 | 
					        self.stack[-1].code.extend(bs.code)
 | 
				
			||||||
        if len(self.stack) == 1:
 | 
					        if len(self.stack) == 1:
 | 
				
			||||||
            # We're on outer_ctx, so all injectors should be gone.
 | 
					            # We're on outer_ctx, so all injectors should be gone.
 | 
				
			||||||
@ -337,8 +336,8 @@ class _CallChain(object):
 | 
				
			|||||||
        self.__chain = []
 | 
					        self.__chain = []
 | 
				
			||||||
        return r
 | 
					        return r
 | 
				
			||||||
    def __getattr__(self, name):
 | 
					    def __getattr__(self, name):
 | 
				
			||||||
        if name.endswith('_'):
 | 
					        # Work around keyword conflicts between python and ptx
 | 
				
			||||||
            name = name[:-1]
 | 
					        name = name.strip('_')
 | 
				
			||||||
        self.__chain.append(name)
 | 
					        self.__chain.append(name)
 | 
				
			||||||
        # Another great crime against the universe:
 | 
					        # Another great crime against the universe:
 | 
				
			||||||
        return self
 | 
					        return self
 | 
				
			||||||
@ -455,20 +454,30 @@ class Mem(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
class _MemFactory(_CallChain):
 | 
					class _MemFactory(_CallChain):
 | 
				
			||||||
    """Actual `mem` object"""
 | 
					    """Actual `mem` object"""
 | 
				
			||||||
    def _call(self, type, name, array=False, initializer=None):
 | 
					    def _call(self, type, name, array=False, init=None):
 | 
				
			||||||
        assert len(type) == 2
 | 
					        assert len(type) == 2
 | 
				
			||||||
        memobj = Mem(type, name, array, initializer)
 | 
					        memobj = Mem(type, name, array, init)
 | 
				
			||||||
        if array is True:
 | 
					        if array is True:
 | 
				
			||||||
            array = ['[]']
 | 
					            array = ['[]']
 | 
				
			||||||
        elif array:
 | 
					        elif array:
 | 
				
			||||||
            array = ['[', array, ']']
 | 
					            array = ['[', array, ']']
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            array = []
 | 
					            array = []
 | 
				
			||||||
        if initializer:
 | 
					        if init:
 | 
				
			||||||
            array += [' = ', initializer]
 | 
					            array += [' = ', init]
 | 
				
			||||||
        self.block.code(op=['.%s.%s ' % (type[0], type[1]), name, array])
 | 
					        self.block.code(op=['.%s.%s ' % (type[0], type[1]), name, array])
 | 
				
			||||||
        self.block.inject(name, memobj)
 | 
					        self.block.inject(name, memobj)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # TODO: move vec, addr here, or make this public
 | 
				
			||||||
 | 
					    @staticmethod
 | 
				
			||||||
 | 
					    def initializer(*args, **kwargs):
 | 
				
			||||||
 | 
					        if args and kwargs:
 | 
				
			||||||
 | 
					            raise ValueError("Cannot initialize in both list and struct style")
 | 
				
			||||||
 | 
					        if args:
 | 
				
			||||||
 | 
					            return ['{', _softjoin(args, ','), '}']
 | 
				
			||||||
 | 
					        jkws = _softjoin([[k, ' = ', v] for k, v in kwargs.items()], ', ')
 | 
				
			||||||
 | 
					        return ['{', jkws, '}']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Label(object):
 | 
					class Label(object):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Specifies the target for a branch.
 | 
					    Specifies the target for a branch.
 | 
				
			||||||
@ -586,7 +595,7 @@ def instmethod(func):
 | 
				
			|||||||
    """
 | 
					    """
 | 
				
			||||||
    def wrap(cls, ctx, *args, **kwargs):
 | 
					    def wrap(cls, ctx, *args, **kwargs):
 | 
				
			||||||
        inst = ctx.ptx.instances[cls]
 | 
					        inst = ctx.ptx.instances[cls]
 | 
				
			||||||
        func(inst, ctx, *args, **kwargs)
 | 
					        return func(inst, ctx, *args, **kwargs)
 | 
				
			||||||
    return classmethod(wrap)
 | 
					    return classmethod(wrap)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class PTXEntryPoint(PTXFragment):
 | 
					class PTXEntryPoint(PTXFragment):
 | 
				
			||||||
@ -979,23 +988,22 @@ class DataStream(PTXFragment):
 | 
				
			|||||||
        assert self.cells[idx].texp is None
 | 
					        assert self.cells[idx].texp is None
 | 
				
			||||||
        offset = self.cells[idx].offset
 | 
					        offset = self.cells[idx].offset
 | 
				
			||||||
        self.cells[idx] = _DataCell(offset, vsize, texp)
 | 
					        self.cells[idx] = _DataCell(offset, vsize, texp)
 | 
				
			||||||
 | 
					        self.free.pop(alloc)
 | 
				
			||||||
        # Now reinsert the fragmented free cells.
 | 
					        # Now reinsert the fragmented free cells.
 | 
				
			||||||
        fragments = alloc - vsize
 | 
					        fragments = alloc - vsize
 | 
				
			||||||
        foffset = offset + vsize
 | 
					        foffset = offset + vsize
 | 
				
			||||||
        fsize = 1
 | 
					        fsize = 1
 | 
				
			||||||
        fidx = idx
 | 
					        fidx = idx
 | 
				
			||||||
        while fsize <= self.alignment:
 | 
					        while fsize < self.alignment:
 | 
				
			||||||
            if fragments & fsize:
 | 
					            if fragments & fsize:
 | 
				
			||||||
                assert fsize not in self.free
 | 
					                assert fsize not in self.free
 | 
				
			||||||
                fidx += 1
 | 
					                fidx += 1
 | 
				
			||||||
                self.cells.insert(fidx, _DataCell(foffset, fsize, None))
 | 
					                self.cells.insert(fidx, _DataCell(foffset, fsize, None))
 | 
				
			||||||
                foffset += fsize
 | 
					                foffset += fsize
 | 
				
			||||||
 | 
					                for k, v in filter(lambda (k, v): v >= fidx, self.free.items()):
 | 
				
			||||||
 | 
					                    self.free[k] = v+1
 | 
				
			||||||
                self.free[fsize] = fidx
 | 
					                self.free[fsize] = fidx
 | 
				
			||||||
            fsize *= 2
 | 
					            fsize *= 2
 | 
				
			||||||
        # Adjust indexes. This is ugly, but evidently unavoidable
 | 
					 | 
				
			||||||
        if fidx-idx:
 | 
					 | 
				
			||||||
            for k, v in filter(lambda (k, v): v > idx, self.free.items()):
 | 
					 | 
				
			||||||
                self.free[k] = v+(fidx-idx)
 | 
					 | 
				
			||||||
        return offset
 | 
					        return offset
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @ptx_func
 | 
					    @ptx_func
 | 
				
			||||||
@ -1011,7 +1019,7 @@ class DataStream(PTXFragment):
 | 
				
			|||||||
        opname = ['ldu', 'b%d' % (size*8)]
 | 
					        opname = ['ldu', 'b%d' % (size*8)]
 | 
				
			||||||
        if len(dregs) > 1:
 | 
					        if len(dregs) > 1:
 | 
				
			||||||
            opname.insert(1, 'v%d' % len(dregs))
 | 
					            opname.insert(1, 'v%d' % len(dregs))
 | 
				
			||||||
            dregs = vec(dregs)
 | 
					            dregs = vec(*dregs)
 | 
				
			||||||
        op._call(opname, dregs, addr(areg, offset), ifp=ifp, ifnotp=ifnotp)
 | 
					        op._call(opname, dregs, addr(areg, offset), ifp=ifp, ifnotp=ifnotp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @ptx_func
 | 
					    @ptx_func
 | 
				
			||||||
@ -1042,6 +1050,8 @@ class DataStream(PTXFragment):
 | 
				
			|||||||
        self.finalized = True
 | 
					        self.finalized = True
 | 
				
			||||||
        for dv in self.size_delayvars:
 | 
					        for dv in self.size_delayvars:
 | 
				
			||||||
            dv.val = self._size
 | 
					            dv.val = self._size
 | 
				
			||||||
 | 
					        print "Finalized stream:"
 | 
				
			||||||
 | 
					        self._print_format()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @instmethod
 | 
					    @instmethod
 | 
				
			||||||
    def pack(self, ctx, _out_file_ = None, **kwargs):
 | 
					    def pack(self, ctx, _out_file_ = None, **kwargs):
 | 
				
			||||||
@ -1087,8 +1097,7 @@ class DataStream(PTXFragment):
 | 
				
			|||||||
                vals = []
 | 
					                vals = []
 | 
				
			||||||
            outfile.write(struct.pack(type, *vals))
 | 
					            outfile.write(struct.pack(type, *vals))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @instmethod
 | 
					    def _print_format(self, ctx=None, stream=None):
 | 
				
			||||||
    def print_record(self, ctx):
 | 
					 | 
				
			||||||
        for cell in self.cells:
 | 
					        for cell in self.cells:
 | 
				
			||||||
            if cell.texp is None:
 | 
					            if cell.texp is None:
 | 
				
			||||||
                print '%3d %2d --' % (cell.offset, cell.size)
 | 
					                print '%3d %2d --' % (cell.offset, cell.size)
 | 
				
			||||||
@ -1096,5 +1105,24 @@ class DataStream(PTXFragment):
 | 
				
			|||||||
            print '%3d %2d %4s %s' % (cell.offset, cell.size, cell.texp.type,
 | 
					            print '%3d %2d %4s %s' % (cell.offset, cell.size, cell.texp.type,
 | 
				
			||||||
                                      cell.texp.exprlist[0])
 | 
					                                      cell.texp.exprlist[0])
 | 
				
			||||||
            for exp in cell.texp.exprlist[1:]:
 | 
					            for exp in cell.texp.exprlist[1:]:
 | 
				
			||||||
                print '%12s %s' % ('', exp)
 | 
					                print '%11s %s' % ('', exp)
 | 
				
			||||||
 | 
					    print_format = instmethod(_print_format)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @instmethod
 | 
				
			||||||
 | 
					    def print_record(self, ctx, stream, limit=None):
 | 
				
			||||||
 | 
					        for i in range(0, len(stream), self._size):
 | 
				
			||||||
 | 
					            for cell in self.cells:
 | 
				
			||||||
 | 
					                if cell.texp is None:
 | 
				
			||||||
 | 
					                    print '%3d %2d --' % (cell.offset, cell.size)
 | 
				
			||||||
 | 
					                    continue
 | 
				
			||||||
 | 
					                print '%3d %2d %4s %s' % (cell.offset, cell.size,
 | 
				
			||||||
 | 
					                        cell.texp.type,
 | 
				
			||||||
 | 
					                        struct.unpack(cell.texp.type,
 | 
				
			||||||
 | 
					                            stream[cell.offset:cell.offset+cell.size]))
 | 
				
			||||||
 | 
					                for exp in cell.texp.exprlist:
 | 
				
			||||||
 | 
					                    print '%11s %s' % ('', exp)
 | 
				
			||||||
 | 
					            print '\n----\n'
 | 
				
			||||||
 | 
					            if limit is not None:
 | 
				
			||||||
 | 
					                limit -= 1
 | 
				
			||||||
 | 
					                if limit <= 0: break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -1,3 +1,4 @@
 | 
				
			|||||||
 | 
					import math
 | 
				
			||||||
from ctypes import *
 | 
					from ctypes import *
 | 
				
			||||||
from cStringIO import StringIO
 | 
					from cStringIO import StringIO
 | 
				
			||||||
import numpy as np
 | 
					import numpy as np
 | 
				
			||||||
@ -7,53 +8,80 @@ from fr0stlib.pyflam3._flam3 import *
 | 
				
			|||||||
from fr0stlib.pyflam3.constants import *
 | 
					from fr0stlib.pyflam3.constants import *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from cuburnlib.cuda import LaunchContext
 | 
					from cuburnlib.cuda import LaunchContext
 | 
				
			||||||
from cuburnlib.device_code import IterThread, CPDataStream
 | 
					from cuburnlib.device_code import *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Point = lambda x, y: np.array([x, y], dtype=np.double)
 | 
					Point = lambda x, y: np.array([x, y], dtype=np.double)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Genome(pyflam3.Genome):
 | 
					class Genome(pyflam3.Genome):
 | 
				
			||||||
    pass
 | 
					    pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Frame(pyflam3.Frame):
 | 
					class _Frame(pyflam3.Frame):
 | 
				
			||||||
    def interpolate(self, time, cp):
 | 
					 | 
				
			||||||
        flam3_interpolate(self.genomes, self.ngenomes, time, 0, byref(cp))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def pack_stream(self, ctx, time):
 | 
					 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
        Pack and return the control point data stream to render this frame.
 | 
					    ctypes flam3_frame object used for genome interpolation and
 | 
				
			||||||
 | 
					    spatial filter creation
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
        # Get the central control point, and calculate parameters that change
 | 
					    def __init__(self, genomes, *args, **kwargs):
 | 
				
			||||||
        # once per frame
 | 
					        pyflam3.Frame.__init__(self, *args, **kwargs)
 | 
				
			||||||
        cp = BaseGenome()
 | 
					        self.genomes = (BaseGenome * len(genomes))()
 | 
				
			||||||
        self.interpolate(time, cp)
 | 
					        for i in range(len(genomes)):
 | 
				
			||||||
        self.filt = Filters(self, cp)
 | 
					            memmove(byref(self.genomes[i]), byref(genomes[i]),
 | 
				
			||||||
        rw = cp.spatial_oversample * cp.width  + 2 * self.filt.gutter
 | 
					                    sizeof(BaseGenome))
 | 
				
			||||||
        rh = cp.spatial_oversample * cp.height + 2 * self.filt.gutter
 | 
					        self.ngenomes = len(genomes)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if cp.nbatches * cp.ntemporal_samples < ctx.ctas:
 | 
					        # TODO: do this here?
 | 
				
			||||||
 | 
					        self.pixel_aspect_ratio = float(genomes[0].height) / genomes[0].width
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def interpolate(self, time, stagger=0, cp=None):
 | 
				
			||||||
 | 
					        cp = cp or BaseGenome()
 | 
				
			||||||
 | 
					        flam3_interpolate(self.genomes, self.ngenomes, time,
 | 
				
			||||||
 | 
					                          stagger, byref(cp))
 | 
				
			||||||
 | 
					        return cp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Frame(object):
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    Handler for a single frame of a rendered genome.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    def __init__(self, _frame, time):
 | 
				
			||||||
 | 
					        self._frame = _frame
 | 
				
			||||||
 | 
					        self.center_cp = self._frame.interpolate(time)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def upload_data(self, ctx, filters, time):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        Prepare and upload the data needed to render this frame to the device.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        center = self.center_cp
 | 
				
			||||||
 | 
					        ncps = center.nbatches * center.ntemporal_samples
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if ncps < ctx.ctas:
 | 
				
			||||||
            raise NotImplementedError(
 | 
					            raise NotImplementedError(
 | 
				
			||||||
                "Distribution of a CP across multiple CTAs not yet done")
 | 
					                "Distribution of a CP across multiple CTAs not yet done")
 | 
				
			||||||
        # Interpolate each time step, calculate per-step variables, and pack
 | 
					
 | 
				
			||||||
        # into the stream
 | 
					        # TODO: isn't this leaking ctypes xforms all over the place?
 | 
				
			||||||
        stream = StringIO()
 | 
					        stream = StringIO()
 | 
				
			||||||
        print "Data stream contents:"
 | 
					        cp_list = []
 | 
				
			||||||
        CPDataStream.print_record(ctx)
 | 
					 | 
				
			||||||
        tcp = BaseGenome()
 | 
					 | 
				
			||||||
        for batch_idx in range(cp.nbatches):
 | 
					 | 
				
			||||||
            for time_idx in range(cp.ntemporal_samples):
 | 
					 | 
				
			||||||
                idx = time_idx + batch_idx * cp.nbatches
 | 
					 | 
				
			||||||
                cp_time = time + self.filt.temporal_deltas[idx]
 | 
					 | 
				
			||||||
                self.interpolate(time, tcp)
 | 
					 | 
				
			||||||
                tcp.camera = Camera(self, tcp, self.filt)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
                tcp.nsamples = (tcp.camera.sample_density *
 | 
					        for batch_idx in range(center.nbatches):
 | 
				
			||||||
                                cp.width * cp.height) / (
 | 
					            for time_idx in range(center.ntemporal_samples):
 | 
				
			||||||
                                cp.nbatches * cp.ntemporal_samples)
 | 
					                idx = time_idx + batch_idx * center.nbatches
 | 
				
			||||||
 | 
					                time = time + filters.temporal_deltas[idx]
 | 
				
			||||||
 | 
					                cp = self._frame.interpolate(time)
 | 
				
			||||||
 | 
					                cp_list.append(cp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                CPDataStream.pack_into(ctx, stream,
 | 
					                cp.camera = Camera(self._frame, cp, filters)
 | 
				
			||||||
                        frame=self, cp=tcp, cp_idx=idx)
 | 
					                cp.nsamples = (cp.camera.sample_density *
 | 
				
			||||||
 | 
					                               center.width * center.height) / ncps
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        print "Expected writes:", (
 | 
				
			||||||
 | 
					                cp.camera.sample_density * center.width * center.height)
 | 
				
			||||||
 | 
					        min_time = min(filters.temporal_deltas)
 | 
				
			||||||
 | 
					        max_time = max(filters.temporal_deltas)
 | 
				
			||||||
 | 
					        for i, cp in enumerate(cp_list):
 | 
				
			||||||
 | 
					            cp.norm_time = (filters.temporal_deltas[i] - min_time) / (
 | 
				
			||||||
 | 
					                            max_time - min_time)
 | 
				
			||||||
 | 
					            CPDataStream.pack_into(ctx, stream, frame=self, cp=cp, cp_idx=idx)
 | 
				
			||||||
 | 
					        PaletteLookup.upload_palette(ctx, self, cp_list)
 | 
				
			||||||
        stream.seek(0)
 | 
					        stream.seek(0)
 | 
				
			||||||
        return (stream.read(), cp.nbatches * cp.ntemporal_samples)
 | 
					        IterThread.upload_cp_stream(ctx, stream.read(), ncps)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Animation(object):
 | 
					class Animation(object):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
@ -74,15 +102,12 @@ class Animation(object):
 | 
				
			|||||||
    interpolated sequence between one or two genomes.
 | 
					    interpolated sequence between one or two genomes.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    def __init__(self, genomes):
 | 
					    def __init__(self, genomes):
 | 
				
			||||||
        self.genomes = (Genome * len(genomes))()
 | 
					        # _frame is the ctypes frame object used only for interpolation
 | 
				
			||||||
        for i in range(len(genomes)):
 | 
					        self._frame = _Frame(genomes)
 | 
				
			||||||
            memmove(byref(self.genomes[i]), byref(genomes[i]),
 | 
					 | 
				
			||||||
                    sizeof(BaseGenome))
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.features = Features(genomes)
 | 
					        # Use the same set of filters throughout the anim, a la flam3
 | 
				
			||||||
        self.frame = Frame()
 | 
					        self.filters = Filters(self._frame, genomes[0])
 | 
				
			||||||
        self.frame.genomes = cast(self.genomes, POINTER(BaseGenome))
 | 
					        self.features = Features(genomes, self.filters)
 | 
				
			||||||
        self.frame.ngenomes = len(genomes)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.ctx = None
 | 
					        self.ctx = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -103,25 +128,17 @@ class Animation(object):
 | 
				
			|||||||
        # TODO: support more nuanced frame control than just 'time'
 | 
					        # TODO: support more nuanced frame control than just 'time'
 | 
				
			||||||
        # TODO: reuse more information between frames
 | 
					        # TODO: reuse more information between frames
 | 
				
			||||||
        # TODO: allow animation-long override of certain parameters (size, etc)
 | 
					        # TODO: allow animation-long override of certain parameters (size, etc)
 | 
				
			||||||
        cp_stream, num_cps = self.frame.pack_stream(self.ctx, time)
 | 
					        frame = Frame(self._frame, time)
 | 
				
			||||||
        iter_thread = self.ctx.ptx.instances[IterThread]
 | 
					        frame.upload_data(self.ctx, self.filters, time)
 | 
				
			||||||
        IterThread.upload_cp_stream(self.ctx, cp_stream, num_cps)
 | 
					        self.ctx.set_up()
 | 
				
			||||||
        IterThread.call(self.ctx)
 | 
					        IterThread.call(self.ctx)
 | 
				
			||||||
 | 
					        return HistScatter.get_bins(self.ctx, self.features)
 | 
				
			||||||
class Features(object):
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    Determine features and constants required to render a particular set of
 | 
					 | 
				
			||||||
    genomes. The values of this class are fixed before compilation begins.
 | 
					 | 
				
			||||||
    """
 | 
					 | 
				
			||||||
    # Constant; number of rounds spent fusing points on first CP of a frame
 | 
					 | 
				
			||||||
    num_fuse_samples = 25
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, genomes):
 | 
					 | 
				
			||||||
        self.max_ntemporal_samples = max(
 | 
					 | 
				
			||||||
                [cp.nbatches * cp.ntemporal_samples for cp in genomes]) + 1
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Filters(object):
 | 
					class Filters(object):
 | 
				
			||||||
    def __init__(self, frame, cp):
 | 
					    def __init__(self, frame, cp):
 | 
				
			||||||
 | 
					        # Use one oversample per filter set, even over multiple timesteps
 | 
				
			||||||
 | 
					        self.oversample = frame.genomes[0].spatial_oversample
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Ugh. I'd really like to replace this mess
 | 
					        # Ugh. I'd really like to replace this mess
 | 
				
			||||||
        spa_filt_ptr = POINTER(c_double)()
 | 
					        spa_filt_ptr = POINTER(c_double)()
 | 
				
			||||||
        spa_width = flam3_create_spatial_filter(byref(frame),
 | 
					        spa_width = flam3_create_spatial_filter(byref(frame),
 | 
				
			||||||
@ -152,7 +169,32 @@ class Filters(object):
 | 
				
			|||||||
        flam3_free(tmp_deltas_ptr)
 | 
					        flam3_free(tmp_deltas_ptr)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # TODO: density estimation
 | 
					        # TODO: density estimation
 | 
				
			||||||
        self.gutter = (spa_width - cp.spatial_oversample) / 2
 | 
					        self.gutter = (spa_width - self.oversample) / 2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class Features(object):
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    Determine features and constants required to render a particular set of
 | 
				
			||||||
 | 
					    genomes. The values of this class are fixed before compilation begins.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    # Constant; number of rounds spent fusing points on first CP of a frame
 | 
				
			||||||
 | 
					    num_fuse_samples = 25
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, genomes, flt):
 | 
				
			||||||
 | 
					        any = lambda l: bool(filter(None, map(l, genomes)))
 | 
				
			||||||
 | 
					        self.max_ntemporal_samples = max(
 | 
				
			||||||
 | 
					                [cp.nbatches * cp.ntemporal_samples for cp in genomes])
 | 
				
			||||||
 | 
					        self.camera_rotation = any(lambda cp: cp.rotate)
 | 
				
			||||||
 | 
					        self.non_box_temporal_filter = genomes[0].temporal_filter_type
 | 
				
			||||||
 | 
					        self.palette_mode = genomes[0].palette_mode and "linear" or "nearest"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Histogram (and log-density copy) width and height
 | 
				
			||||||
 | 
					        self.hist_width  = flt.oversample * genomes[0].width  + 2 * flt.gutter
 | 
				
			||||||
 | 
					        self.hist_height = flt.oversample * genomes[0].height + 2 * flt.gutter
 | 
				
			||||||
 | 
					        # Histogram stride, for better filtering. This code assumes the
 | 
				
			||||||
 | 
					        # 128-byte L1 cache line width of Fermi devices, and a 16-byte
 | 
				
			||||||
 | 
					        # histogram bucket size. TODO: detect these things programmatically,
 | 
				
			||||||
 | 
					        # particularly the histogram bucket size, which may be split soon
 | 
				
			||||||
 | 
					        self.hist_stride = 8 * int(math.ceil(self.hist_width / 8.0))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Camera(object):
 | 
					class Camera(object):
 | 
				
			||||||
    """Viewport and exposure."""
 | 
					    """Viewport and exposure."""
 | 
				
			||||||
@ -165,6 +207,7 @@ class Camera(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        center = Point(cp._center[0], cp._center[1])
 | 
					        center = Point(cp._center[0], cp._center[1])
 | 
				
			||||||
        size = Point(cp.width, cp.height)
 | 
					        size = Point(cp.width, cp.height)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # pix per unit, where 'unit' is '1.0' in IFS space
 | 
					        # pix per unit, where 'unit' is '1.0' in IFS space
 | 
				
			||||||
        self.ppu = Point(
 | 
					        self.ppu = Point(
 | 
				
			||||||
            cp.pixels_per_unit * scale / frame.pixel_aspect_ratio,
 | 
					            cp.pixels_per_unit * scale / frame.pixel_aspect_ratio,
 | 
				
			||||||
@ -174,6 +217,8 @@ class Camera(object):
 | 
				
			|||||||
        cornerLL = center - (size / (2 * self.ppu))
 | 
					        cornerLL = center - (size / (2 * self.ppu))
 | 
				
			||||||
        self.lower_bounds = cornerLL - gutter
 | 
					        self.lower_bounds = cornerLL - gutter
 | 
				
			||||||
        self.upper_bounds = cornerLL + (size / self.ppu) + gutter
 | 
					        self.upper_bounds = cornerLL + (size / self.ppu) + gutter
 | 
				
			||||||
        self.ifs_space_size = 1.0 / (self.upper_bounds - self.lower_bounds)
 | 
					        self.norm_scale = 1.0 / (self.upper_bounds - self.lower_bounds)
 | 
				
			||||||
        # TODO: coordinate transforms in concert with GPU (rotation, size)
 | 
					        self.norm_offset = -self.norm_scale * self.lower_bounds
 | 
				
			||||||
 | 
					        self.idx_scale = size * self.norm_scale
 | 
				
			||||||
 | 
					        self.idx_offset = size * self.norm_offset
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										49
									
								
								main.py
									
									
									
									
									
								
							
							
						
						
									
										49
									
								
								main.py
									
									
									
									
									
								
							@ -21,6 +21,14 @@ from fr0stlib.pyflam3 import *
 | 
				
			|||||||
from fr0stlib.pyflam3._flam3 import *
 | 
					from fr0stlib.pyflam3._flam3 import *
 | 
				
			||||||
from cuburnlib.render import *
 | 
					from cuburnlib.render import *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import pyglet
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def dump_3d(nda):
 | 
				
			||||||
 | 
					    with open('/tmp/data.txt', 'w') as f:
 | 
				
			||||||
 | 
					        for row in nda:
 | 
				
			||||||
 | 
					            f.write('  |  '.join([' '.join(
 | 
				
			||||||
 | 
					                ['%4.1g\t' % x for x in pt]) for pt in row]) + '\n')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def main(args):
 | 
					def main(args):
 | 
				
			||||||
    verbose = 1
 | 
					    verbose = 1
 | 
				
			||||||
    if '-d' in args:
 | 
					    if '-d' in args:
 | 
				
			||||||
@ -30,28 +38,37 @@ def main(args):
 | 
				
			|||||||
        genomes = Genome.from_string(fp.read())
 | 
					        genomes = Genome.from_string(fp.read())
 | 
				
			||||||
    anim = Animation(genomes)
 | 
					    anim = Animation(genomes)
 | 
				
			||||||
    anim.compile()
 | 
					    anim.compile()
 | 
				
			||||||
    anim.render_frame()
 | 
					    bins = anim.render_frame()
 | 
				
			||||||
 | 
					    #dump_3d(bins)
 | 
				
			||||||
 | 
					    bins /= ((np.mean(bins)+1e-9)/128.)
 | 
				
			||||||
 | 
					    bins.astype(np.uint8)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    #genome.width, genome.height = 512, 512
 | 
					    if '-g' not in args:
 | 
				
			||||||
    #genome.sample_density = 1000
 | 
					        return
 | 
				
			||||||
    #obuf, stats, frame = genome.render(estimator=3)
 | 
					 | 
				
			||||||
    #gc.collect()
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        ##q.put(str(obuf))
 | 
					    print anim.features.hist_width
 | 
				
			||||||
    ##p = Process(target=render, args=(q, genome_path))
 | 
					    print anim.features.hist_height
 | 
				
			||||||
    ##p.start()
 | 
					    print anim.features.hist_stride
 | 
				
			||||||
 | 
					    window = pyglet.window.Window(800, 600)
 | 
				
			||||||
 | 
					    image = pyglet.image.ImageData(anim.features.hist_width,
 | 
				
			||||||
 | 
					                                   anim.features.hist_height,
 | 
				
			||||||
 | 
					                                   'RGBA',
 | 
				
			||||||
 | 
					                                   bins.tostring(),
 | 
				
			||||||
 | 
					                                   anim.features.hist_stride*4)
 | 
				
			||||||
 | 
					    tex = image.texture
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    #window = pyglet.window.Window()
 | 
					    @window.event
 | 
				
			||||||
    #image = pyglet.image.ImageData(genome.width, genome.height, 'RGB', obuf)
 | 
					    def on_draw():
 | 
				
			||||||
    #tex = image.texture
 | 
					        window.clear()
 | 
				
			||||||
 | 
					        tex.blit(0, 0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    #@window.event
 | 
					    @window.event
 | 
				
			||||||
    #def on_draw():
 | 
					    def on_key_press(sym, mod):
 | 
				
			||||||
        #window.clear()
 | 
					        if sym == pyglet.window.key.Q:
 | 
				
			||||||
        #tex.blit(0, 0)
 | 
					            pyglet.app.exit()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    #pyglet.app.run()
 | 
					    pyglet.app.run()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
    if len(sys.argv) < 2 or not os.path.isfile(sys.argv[-1]):
 | 
					    if len(sys.argv) < 2 or not os.path.isfile(sys.argv[-1]):
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user