diff --git a/TODO b/TODO index acfdfa0..376a5c2 100644 --- a/TODO +++ b/TODO @@ -2,17 +2,7 @@ Status: passes rudimentary tests Current goals: -- Draw some dang points! - - Allocate buffer (can it be pre-allocated?) - - Direct scatter linear points by GTID from flame number - - Re-enable preview window - - Execute frame, update texture, repeat -- Writeback of points to the buffer - - Define writeback class, args - - Do camera rotation across frameset - - Postpone other kinds of testing and address clamping for now - Start xforms - - At first, fixed Sierpinski triangle or something - xform selection, pre- and post-transform in xform - first of the variations @@ -43,16 +33,12 @@ Things to do (rather severely incomplete): Things to test: -- DeviceStream allocator and proper handling of corner cases - Debug flag/dict/whatever for entire project in general - Iteration counters for IterThread Things to benchmark: - Kernel invocation and/or interrupt times (will high load freeze X?) -- 1D/2D texture load+interpolation speeds vs constant memory loading - - Must test under high SFU load - - Tex uses separate cache? Has lower bandwidth penalty for gather? 
- MWC float conversion - The entire scatter process - Radix sort of writeback coordinates diff --git a/cuburnlib/device_code.py b/cuburnlib/device_code.py index 0462d9c..bc978e2 100644 --- a/cuburnlib/device_code.py +++ b/cuburnlib/device_code.py @@ -11,7 +11,7 @@ import numpy as np from cuburnlib.ptx import * -class IterThread(PTXTest): +class IterThread(PTXEntryPoint): entry_name = 'iter_thread' entry_params = [] @@ -19,7 +19,7 @@ class IterThread(PTXTest): self.cps_uploaded = False def deps(self): - return [MWCRNG, CPDataStream] + return [MWCRNG, CPDataStream, HistScatter] @ptx_func def module_setup(self): @@ -61,18 +61,19 @@ class IterThread(PTXTest): reg.u32('cp_idx cpA') with block("Claim a CP"): std.set_is_first_thread(reg.pred('p_is_first')) - op.atom.inc.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first) + op.atom.add.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first) op.st.shared.u32(addr(s_cp_idx), cp_idx, ifp=p_is_first) + op.st.shared.u32(addr(s_num_samples), 0, ifp=p_is_first) comment("Load the CP index in all threads") - op.bar.sync(0) + op.bar.sync(1) op.ld.shared.u32(cp_idx, addr(s_cp_idx)) - with block("Check to see if this CP is valid (if not, we're done"): + with block("Check to see if this CP is valid (if not, we're done)"): reg.u32('num_cps') reg.pred('p_last_cp') op.ldu.u32(num_cps, addr(g_num_cps)) - op.setp.ge.u32(p_last_cp, cp_idx, 1) + op.setp.ge.u32(p_last_cp, cp_idx, num_cps) op.bra.uni('all_cps_done', ifp=p_last_cp) with block('Load CP address'): @@ -85,33 +86,37 @@ class IterThread(PTXTest): with block("If still fusing, increment count unconditionally"): std.set_is_first_thread(reg.pred('p_is_first')) op.red.shared.add.s32(addr(s_num_samples), 1, ifp=p_is_first) - op.bar.sync(0) + op.bar.sync(2) label('iter_loop_start') comment('Do... 
well, most of everything') + mwc.next_f32_11(x_coord) + mwc.next_f32_11(y_coord) + mwc.next_f32_01(color_coord) + op.add.u32(num_rounds, num_rounds, 1) with block("Test if we're still in FUSE"): reg.s32('num_samples') reg.pred('p_in_fuse') - op.ld.shared.u32(num_samples, addr(s_num_samples)) + op.ld.shared.s32(num_samples, addr(s_num_samples)) op.setp.lt.s32(p_in_fuse, num_samples, 0) op.bra.uni(fuse_loop_start, ifp=p_in_fuse) - with block("Ordinarily, we'd write the result here"): - op.add.u32(num_writes, num_writes, 1) - - # For testing, declare and clear p_badval - reg.pred('p_goodval') - op.setp.eq.u32(p_goodval, 1, 1) + reg.pred('p_point_is_valid') + with block("Write the result"): + hist.scatter(x_coord, y_coord, color_coord, 0, p_point_is_valid) + op.add.u32(num_writes, num_writes, 1, ifp=p_point_is_valid) with block("Increment number of samples by number of good values"): - reg.b32('good_samples') - op.vote.ballot.b32(good_samples, p_goodval) + reg.b32('good_samples laneid') + reg.pred('p_is_first') + op.vote.ballot.b32(good_samples, p_point_is_valid) op.popc.b32(good_samples, good_samples) - std.set_is_first_thread(reg.pred('p_is_first')) + op.mov.u32(laneid, '%laneid') + op.setp.eq.u32(p_is_first, laneid, 0) op.red.shared.add.s32(addr(s_num_samples), good_samples, ifp=p_is_first) @@ -138,6 +143,9 @@ class IterThread(PTXTest): num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps') cuda.memset_d32(num_cps_dp, num_cps, 1) + # TODO: "if debug >= 3" + print "Uploaded stream to card:" + CPDataStream.print_record(ctx, cp_stream, 5) self.cps_uploaded = True @instmethod @@ -148,14 +156,228 @@ class IterThread(PTXTest): cuda.memset_d32(num_cps_st_dp, 0, 1) func = ctx.mod.get_function('iter_thread') - dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True) + tr = ctx.ptx.instances[PaletteLookup].texref + dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True, + texrefs=[tr]) + shape = (ctx.grid[0], ctx.block[0]/32, 32) num_rounds_dp, num_rounds_l = 
ctx.mod.get_global('g_num_rounds') num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes') - rounds = cuda.from_device(num_rounds_dp, ctx.threads, np.uint32) - writes = cuda.from_device(num_writes_dp, ctx.threads, np.uint32) - print "Rounds:", rounds - print "Writes:", writes + rounds = cuda.from_device(num_rounds_dp, shape, np.int32) + writes = cuda.from_device(num_writes_dp, shape, np.int32) + print "Rounds:", sum(rounds) + print "Writes:", sum(writes) + print rounds + print writes + +class CameraTransform(PTXFragment): + shortname = 'camera' + def deps(self): + return [CPDataStream] + + @ptx_func + def rotate(self, rotated_x, rotated_y, x, y): + """ + Rotate an IFS-space coordinate as defined by the camera. + """ + if features.camera_rotation: + assert rotated_x.name != x.name and rotated_y.name != y.name + with block("Rotate %s, %s to camera alignment" % (x, y)): + reg.f32('rot_center_x rot_center_y') + cp.get_v2(cpA, rot_center_x, 'cp.rot_center[0]', + rot_center_y, 'cp.rot_center[1]') + op.sub.f32(x, x, rot_center_x) + op.sub.f32(y, y, rot_center_y) + + reg.f32('rot_sin_t rot_cos_t rot_old_x rot_old_y') + cp.get_v2(cpA, rot_cos_t, 'cos(cp.rotate * 2 * pi / 360.)', + rot_sin_t, '-sin(cp.rotate * 2 * pi / 360.)') + + comment('rotated_x = x * cos(t) - y * sin(t) + rot_center_x') + op.fma.rn.f32(rotated_x, x, rot_cos_t, rot_center_x) + op.fma.rn.f32(rotated_x, y, rot_sin_t, rotated_x) + + op.neg.f32(rot_sin_t, rot_sin_t) + comment('rotated_y = x * sin(t) + y * cos(t) + rot_center_y') + op.fma.rn.f32(rotated_y, x, rot_sin_t, rot_center_y) + op.fma.rn.f32(rotated_y, y, rot_cos_t, rotated_y) + + # TODO: if this is a register-critical section, reloading + # rot_center_[xy] here should save two regs. 
OTOH, if this is + # *not* reg-crit, moving the subtraction above to new variables + # may save a few clocks + op.add.f32(x, x, rot_center_x) + op.add.f32(y, y, rot_center_y) + else: + comment("No camera rotation in this kernel") + op.mov.f32(rotated_x, x) + op.mov.f32(rotated_y, y) + + @ptx_func + def get_norm(self, norm_x, norm_y, x, y): + """ + Find the [0,1]-normalized floating-point histogram coordinates + ``norm_x, norm_y`` from the given IFS-space coordinates ``x, y``. + """ + self.rotate(norm_x, norm_y, x, y) + with block("Scale rotated points to [0,1]-normalized coordinates"): + reg.f32('cam_scale cam_offset') + cp.get_v2(cpA, cam_scale, 'cp.camera.norm_scale[0]', + cam_offset, 'cp.camera.norm_offset[0]') + op.fma.f32(norm_x, norm_x, cam_scale, cam_offset) + cp.get_v2(cpA, cam_scale, 'cp.camera.norm_scale[1]', + cam_offset, 'cp.camera.norm_offset[1]') + op.fma.f32(norm_y, norm_y, cam_scale, cam_offset) + + @ptx_func + def get_index(self, index, x, y, pred=None): + """ + Find the histogram index (as a u32) from the IFS spatial coordinate in + ``x, y``. + + If the coordinates are out of bounds, 0xffffffff will be stored to + ``index``. If ``pred`` is given, it will be set if the point is valid, + and cleared if not. 
+ """ + # A few instructions could probably be shaved off of this one + with block("Find histogram index"): + reg.f32('norm_x norm_y') + self.rotate(norm_x, norm_y, x, y) + comment('Scale and offset from IFS to index coordinates') + reg.f32('cam_scale cam_offset') + cp.get_v2(cpA, cam_scale, 'cp.camera.idx_scale[0]', + cam_offset, 'cp.camera.idx_offset[0]') + op.fma.rn.f32(norm_x, norm_x, cam_scale, cam_offset) + + cp.get_v2(cpA, cam_scale, 'cp.camera.idx_scale[1]', + cam_offset, 'cp.camera.idx_offset[1]') + op.fma.rn.f32(norm_y, norm_y, cam_scale, cam_offset) + + comment('Check for bad value') + reg.u32('index_x index_y') + if not pred: + pred = reg.pred('p_valid') + + op.cvt.rzi.s32.f32(index_x, norm_x) + op.setp.ge.s32(pred, index_x, 0) + op.setp.lt.and_.s32(pred, index_x, features.hist_width, pred) + + op.cvt.rzi.s32.f32(index_y, norm_y) + op.setp.ge.and_.s32(pred, index_y, 0, pred) + op.setp.lt.and_.s32(pred, index_y, features.hist_height, pred) + + op.mad.lo.u32(index, index_y, features.hist_stride, index_x) + op.mov.u32(index, 0xffffffff, ifnotp=pred) + +class PaletteLookup(PTXFragment): + shortname = "palette" + # Resolution of texture on device. Bigger = more palette rez, maybe slower + texheight = 16 + + def __init__(self): + self.texref = None + + def deps(self): + return [CPDataStream] + + @ptx_func + def module_setup(self): + mem.global_.texref('t_palette') + + @ptx_func + def look_up(self, r, g, b, a, color, norm_time): + """ + Look up the values of ``r, g, b, a`` corresponding to ``color_coord`` + at the CP indexed in ``timestamp_idx``. Note that both ``color_coord`` + and ``timestamp_idx`` should be [0,1]-normalized floats. 
+ """ + op.tex._2d.v4.f32.f32(vec(r, g, b, a), + addr([t_palette, ', ', vec(norm_time, color)])) + if features.non_box_temporal_filter: + raise NotImplementedError("Non-box temporal filters not supported") + + @instmethod + def upload_palette(self, ctx, frame, cp_list): + """ + Extract the palette from the given list of interpolated CPs, and upload + it to the device as a texture. + """ + # TODO: figure out if storing the full list is an actual drag on + # performance/memory + if frame.center_cp.temporal_filter_type != 0: + # TODO: make texture sample based on time, not on CP index + raise NotImplementedError("Use box temporal filters for now") + pal = np.ndarray((self.texheight, 256, 4), dtype=np.float32) + inv = float(len(cp_list) - 1) / (self.texheight - 1) + for y in range(self.texheight): + for x in range(256): + for c in range(4): + # TODO: interpolate here? + cy = int(round(y * inv)) + pal[y][x][c] = cp_list[cy].palette.entries[x].color[c] + dev_array = cuda.make_multichannel_2d_array(pal, "C") + self.texref = ctx.mod.get_texref('t_palette') + # TODO: float16? or can we still use interp with int storage? + self.texref.set_format(cuda.array_format.FLOAT, 4) + self.texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) + self.texref.set_filter_mode(cuda.filter_mode.LINEAR) + self.texref.set_address_mode(0, cuda.address_mode.CLAMP) + self.texref.set_address_mode(1, cuda.address_mode.CLAMP) + self.texref.set_array(dev_array) + + def device_init(self, ctx): + assert self.texref, "Must upload palette texture before launch!" 
+ +class HistScatter(PTXFragment): + shortname = "hist" + def deps(self): + return [CPDataStream, CameraTransform, PaletteLookup] + + @ptx_func + def module_setup(self): + mem.global_.f32('g_hist_bins', + features.hist_height * features.hist_stride * 4) + + @ptx_func + def entry_setup(self): + comment("For now, assume histogram bins have been cleared by host") + + @ptx_func + def scatter(self, x, y, color, xf_idx, p_valid=None): + """ + Scatter the given point directly to the histogram bins. I think this + technique has the worst performance of all of 'em. Accesses ``cpA`` + directly. + """ + with block("Scatter directly to buffer"): + if p_valid is None: + p_valid = reg.pred('p_valid') + reg.u32('hist_index') + camera.get_index(hist_index, x, y, p_valid) + reg.u32('hist_bin_addr') + op.mov.u32(hist_bin_addr, g_hist_bins) + op.mad.lo.u32(hist_bin_addr, hist_index, 16, hist_bin_addr) + + reg.f32('r g b a norm_time') + cp.get(cpA, norm_time, 'cp.norm_time') + palette.look_up(r, g, b, a, color, norm_time) + # TODO: look up, scale by xform visibility + op.red.add.f32(addr(hist_bin_addr), r) + op.red.add.f32(addr(hist_bin_addr,4), g) + op.red.add.f32(addr(hist_bin_addr,8), b) + op.red.add.f32(addr(hist_bin_addr,12), a) + + + def device_init(self, ctx): + hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins') + cuda.memset_d32(hist_bins_dp, 0, hist_bins_l/4) + + @instmethod + def get_bins(self, ctx, features): + hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins') + return cuda.from_device(hist_bins_dp, + (features.hist_height, features.hist_stride, 4), + dtype=np.float32) class MWCRNG(PTXFragment): shortname = "mwc" @@ -218,14 +440,15 @@ class MWCRNG(PTXFragment): with block('Load random float [0,1] into ' + dst_reg.name): self._next() op.cvt.rn.f32.u32(dst_reg, mwc_st) - op.mul.f32(dst_reg, dst_reg, '0f0000802F') # 1./(1<<32) + op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32) @ptx_func def next_f32_11(self, dst_reg): with block('Load random float 
[-1,1) into ' + dst_reg.name): + reg.u32('mwc_to_float') self._next() op.cvt.rn.f32.s32(dst_reg, mwc_st) - op.mul.f32(dst_reg, dst_reg, '0f00000030') # 1./(1<<31) + op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31) def device_init(self, ctx): if self.threads_ready >= ctx.threads: diff --git a/cuburnlib/ptx.py b/cuburnlib/ptx.py index ad6c85e..5a18b2c 100644 --- a/cuburnlib/ptx.py +++ b/cuburnlib/ptx.py @@ -14,6 +14,7 @@ import types import struct from cStringIO import StringIO from collections import namedtuple +from math import * # Okay, so here's what's going on. # @@ -137,7 +138,7 @@ class _Block(object): self.stack = [self.outer_ctx] def clean_injectors(self): inj = self.stack[-1].injectors - [inj.remove(i) for i in inj if i.dead] + [inj.remove(i) for i in list(inj) if i.dead] def push_ctx(self): self.clean_injectors() self.stack.append(BlockCtx(dict(self.stack[-1].locals), [], [])) @@ -155,8 +156,6 @@ class _Block(object): def pop_ctx(self): self.clean_injectors() bs = self.stack.pop() - # TODO: figure out why this next line is needed - [bs.injectors.remove(i) for i in bs.injectors if i.dead] self.stack[-1].code.extend(bs.code) if len(self.stack) == 1: # We're on outer_ctx, so all injectors should be gone. 
@@ -337,8 +336,8 @@ class _CallChain(object): self.__chain = [] return r def __getattr__(self, name): - if name.endswith('_'): - name = name[:-1] + # Work around keyword conflicts between python and ptx + name = name.strip('_') self.__chain.append(name) # Another great crime against the universe: return self @@ -455,20 +454,30 @@ class Mem(object): class _MemFactory(_CallChain): """Actual `mem` object""" - def _call(self, type, name, array=False, initializer=None): + def _call(self, type, name, array=False, init=None): assert len(type) == 2 - memobj = Mem(type, name, array, initializer) + memobj = Mem(type, name, array, init) if array is True: array = ['[]'] elif array: array = ['[', array, ']'] else: array = [] - if initializer: - array += [' = ', initializer] + if init: + array += [' = ', init] self.block.code(op=['.%s.%s ' % (type[0], type[1]), name, array]) self.block.inject(name, memobj) + # TODO: move vec, addr here, or make this public + @staticmethod + def initializer(*args, **kwargs): + if args and kwargs: + raise ValueError("Cannot initialize in both list and struct style") + if args: + return ['{', _softjoin(args, ','), '}'] + jkws = _softjoin([[k, ' = ', v] for k, v in kwargs.items()], ', ') + return ['{', jkws, '}'] + class Label(object): """ Specifies the target for a branch. @@ -586,7 +595,7 @@ def instmethod(func): """ def wrap(cls, ctx, *args, **kwargs): inst = ctx.ptx.instances[cls] - func(inst, ctx, *args, **kwargs) + return func(inst, ctx, *args, **kwargs) return classmethod(wrap) class PTXEntryPoint(PTXFragment): @@ -979,23 +988,22 @@ class DataStream(PTXFragment): assert self.cells[idx].texp is None offset = self.cells[idx].offset self.cells[idx] = _DataCell(offset, vsize, texp) + self.free.pop(alloc) # Now reinsert the fragmented free cells. 
fragments = alloc - vsize foffset = offset + vsize fsize = 1 fidx = idx - while fsize <= self.alignment: + while fsize < self.alignment: if fragments & fsize: assert fsize not in self.free fidx += 1 self.cells.insert(fidx, _DataCell(foffset, fsize, None)) foffset += fsize + for k, v in filter(lambda (k, v): v >= fidx, self.free.items()): + self.free[k] = v+1 self.free[fsize] = fidx fsize *= 2 - # Adjust indexes. This is ugly, but evidently unavoidable - if fidx-idx: - for k, v in filter(lambda (k, v): v > idx, self.free.items()): - self.free[k] = v+(fidx-idx) return offset @ptx_func @@ -1011,7 +1019,7 @@ class DataStream(PTXFragment): opname = ['ldu', 'b%d' % (size*8)] if len(dregs) > 1: opname.insert(1, 'v%d' % len(dregs)) - dregs = vec(dregs) + dregs = vec(*dregs) op._call(opname, dregs, addr(areg, offset), ifp=ifp, ifnotp=ifnotp) @ptx_func @@ -1042,6 +1050,8 @@ class DataStream(PTXFragment): self.finalized = True for dv in self.size_delayvars: dv.val = self._size + print "Finalized stream:" + self._print_format() @instmethod def pack(self, ctx, _out_file_ = None, **kwargs): @@ -1087,8 +1097,7 @@ class DataStream(PTXFragment): vals = [] outfile.write(struct.pack(type, *vals)) - @instmethod - def print_record(self, ctx): + def _print_format(self, ctx=None, stream=None): for cell in self.cells: if cell.texp is None: print '%3d %2d --' % (cell.offset, cell.size) @@ -1096,5 +1105,24 @@ class DataStream(PTXFragment): print '%3d %2d %4s %s' % (cell.offset, cell.size, cell.texp.type, cell.texp.exprlist[0]) for exp in cell.texp.exprlist[1:]: - print '%12s %s' % ('', exp) + print '%11s %s' % ('', exp) + print_format = instmethod(_print_format) + + @instmethod + def print_record(self, ctx, stream, limit=None): + for i in range(0, len(stream), self._size): + for cell in self.cells: + if cell.texp is None: + print '%3d %2d --' % (cell.offset, cell.size) + continue + print '%3d %2d %4s %s' % (cell.offset, cell.size, + cell.texp.type, + struct.unpack(cell.texp.type, + 
stream[cell.offset:cell.offset+cell.size])) + for exp in cell.texp.exprlist: + print '%11s %s' % ('', exp) + print '\n----\n' + if limit is not None: + limit -= 1 + if limit <= 0: break diff --git a/cuburnlib/render.py b/cuburnlib/render.py index 65e80f8..290129a 100644 --- a/cuburnlib/render.py +++ b/cuburnlib/render.py @@ -1,3 +1,4 @@ +import math from ctypes import * from cStringIO import StringIO import numpy as np @@ -7,53 +8,80 @@ from fr0stlib.pyflam3._flam3 import * from fr0stlib.pyflam3.constants import * from cuburnlib.cuda import LaunchContext -from cuburnlib.device_code import IterThread, CPDataStream +from cuburnlib.device_code import * Point = lambda x, y: np.array([x, y], dtype=np.double) class Genome(pyflam3.Genome): pass -class Frame(pyflam3.Frame): - def interpolate(self, time, cp): - flam3_interpolate(self.genomes, self.ngenomes, time, 0, byref(cp)) +class _Frame(pyflam3.Frame): + """ + ctypes flam3_frame object used for genome interpolation and + spatial filter creation + """ + def __init__(self, genomes, *args, **kwargs): + pyflam3.Frame.__init__(self, *args, **kwargs) + self.genomes = (BaseGenome * len(genomes))() + for i in range(len(genomes)): + memmove(byref(self.genomes[i]), byref(genomes[i]), + sizeof(BaseGenome)) + self.ngenomes = len(genomes) - def pack_stream(self, ctx, time): - """ - Pack and return the control point data stream to render this frame. - """ - # Get the central control point, and calculate parameters that change - # once per frame - cp = BaseGenome() - self.interpolate(time, cp) - self.filt = Filters(self, cp) - rw = cp.spatial_oversample * cp.width + 2 * self.filt.gutter - rh = cp.spatial_oversample * cp.height + 2 * self.filt.gutter + # TODO: do this here? 
+ self.pixel_aspect_ratio = float(genomes[0].height) / genomes[0].width - if cp.nbatches * cp.ntemporal_samples < ctx.ctas: + def interpolate(self, time, stagger=0, cp=None): + cp = cp or BaseGenome() + flam3_interpolate(self.genomes, self.ngenomes, time, + stagger, byref(cp)) + return cp + +class Frame(object): + """ + Handler for a single frame of a rendered genome. + """ + def __init__(self, _frame, time): + self._frame = _frame + self.center_cp = self._frame.interpolate(time) + + def upload_data(self, ctx, filters, time): + """ + Prepare and upload the data needed to render this frame to the device. + """ + center = self.center_cp + ncps = center.nbatches * center.ntemporal_samples + + if ncps < ctx.ctas: raise NotImplementedError( "Distribution of a CP across multiple CTAs not yet done") - # Interpolate each time step, calculate per-step variables, and pack - # into the stream + + # TODO: isn't this leaking ctypes xforms all over the place? stream = StringIO() - print "Data stream contents:" - CPDataStream.print_record(ctx) - tcp = BaseGenome() - for batch_idx in range(cp.nbatches): - for time_idx in range(cp.ntemporal_samples): - idx = time_idx + batch_idx * cp.nbatches - cp_time = time + self.filt.temporal_deltas[idx] - self.interpolate(time, tcp) - tcp.camera = Camera(self, tcp, self.filt) + cp_list = [] - tcp.nsamples = (tcp.camera.sample_density * - cp.width * cp.height) / ( - cp.nbatches * cp.ntemporal_samples) + for batch_idx in range(center.nbatches): + for time_idx in range(center.ntemporal_samples): + idx = time_idx + batch_idx * center.nbatches + time = time + filters.temporal_deltas[idx] + cp = self._frame.interpolate(time) + cp_list.append(cp) - CPDataStream.pack_into(ctx, stream, - frame=self, cp=tcp, cp_idx=idx) + cp.camera = Camera(self._frame, cp, filters) + cp.nsamples = (cp.camera.sample_density * + center.width * center.height) / ncps + + print "Expected writes:", ( + cp.camera.sample_density * center.width * center.height) + min_time = 
min(filters.temporal_deltas) + max_time = max(filters.temporal_deltas) + for i, cp in enumerate(cp_list): + cp.norm_time = (filters.temporal_deltas[i] - min_time) / ( + max_time - min_time) + CPDataStream.pack_into(ctx, stream, frame=self, cp=cp, cp_idx=idx) + PaletteLookup.upload_palette(ctx, self, cp_list) stream.seek(0) - return (stream.read(), cp.nbatches * cp.ntemporal_samples) + IterThread.upload_cp_stream(ctx, stream.read(), ncps) class Animation(object): """ @@ -74,15 +102,12 @@ class Animation(object): interpolated sequence between one or two genomes. """ def __init__(self, genomes): - self.genomes = (Genome * len(genomes))() - for i in range(len(genomes)): - memmove(byref(self.genomes[i]), byref(genomes[i]), - sizeof(BaseGenome)) + # _frame is the ctypes frame object used only for interpolation + self._frame = _Frame(genomes) - self.features = Features(genomes) - self.frame = Frame() - self.frame.genomes = cast(self.genomes, POINTER(BaseGenome)) - self.frame.ngenomes = len(genomes) + # Use the same set of filters throughout the anim, a la flam3 + self.filters = Filters(self._frame, genomes[0]) + self.features = Features(genomes, self.filters) self.ctx = None @@ -103,25 +128,17 @@ class Animation(object): # TODO: support more nuanced frame control than just 'time' # TODO: reuse more information between frames # TODO: allow animation-long override of certain parameters (size, etc) - cp_stream, num_cps = self.frame.pack_stream(self.ctx, time) - iter_thread = self.ctx.ptx.instances[IterThread] - IterThread.upload_cp_stream(self.ctx, cp_stream, num_cps) + frame = Frame(self._frame, time) + frame.upload_data(self.ctx, self.filters, time) + self.ctx.set_up() IterThread.call(self.ctx) - -class Features(object): - """ - Determine features and constants required to render a particular set of - genomes. The values of this class are fixed before compilation begins. 
- """ - # Constant; number of rounds spent fusing points on first CP of a frame - num_fuse_samples = 25 - - def __init__(self, genomes): - self.max_ntemporal_samples = max( - [cp.nbatches * cp.ntemporal_samples for cp in genomes]) + 1 + return HistScatter.get_bins(self.ctx, self.features) class Filters(object): def __init__(self, frame, cp): + # Use one oversample per filter set, even over multiple timesteps + self.oversample = frame.genomes[0].spatial_oversample + # Ugh. I'd really like to replace this mess spa_filt_ptr = POINTER(c_double)() spa_width = flam3_create_spatial_filter(byref(frame), @@ -152,7 +169,32 @@ class Filters(object): flam3_free(tmp_deltas_ptr) # TODO: density estimation - self.gutter = (spa_width - cp.spatial_oversample) / 2 + self.gutter = (spa_width - self.oversample) / 2 + +class Features(object): + """ + Determine features and constants required to render a particular set of + genomes. The values of this class are fixed before compilation begins. + """ + # Constant; number of rounds spent fusing points on first CP of a frame + num_fuse_samples = 25 + + def __init__(self, genomes, flt): + any = lambda l: bool(filter(None, map(l, genomes))) + self.max_ntemporal_samples = max( + [cp.nbatches * cp.ntemporal_samples for cp in genomes]) + self.camera_rotation = any(lambda cp: cp.rotate) + self.non_box_temporal_filter = genomes[0].temporal_filter_type + self.palette_mode = genomes[0].palette_mode and "linear" or "nearest" + + # Histogram (and log-density copy) width and height + self.hist_width = flt.oversample * genomes[0].width + 2 * flt.gutter + self.hist_height = flt.oversample * genomes[0].height + 2 * flt.gutter + # Histogram stride, for better filtering. This code assumes the + # 128-byte L1 cache line width of Fermi devices, and a 16-byte + # histogram bucket size. 
TODO: detect these things programmatically, + # particularly the histogram bucket size, which may be split soon + self.hist_stride = 8 * int(math.ceil(self.hist_width / 8.0)) class Camera(object): """Viewport and exposure.""" @@ -165,6 +207,7 @@ class Camera(object): center = Point(cp._center[0], cp._center[1]) size = Point(cp.width, cp.height) + # pix per unit, where 'unit' is '1.0' in IFS space self.ppu = Point( cp.pixels_per_unit * scale / frame.pixel_aspect_ratio, @@ -174,6 +217,8 @@ class Camera(object): cornerLL = center - (size / (2 * self.ppu)) self.lower_bounds = cornerLL - gutter self.upper_bounds = cornerLL + (size / self.ppu) + gutter - self.ifs_space_size = 1.0 / (self.upper_bounds - self.lower_bounds) - # TODO: coordinate transforms in concert with GPU (rotation, size) + self.norm_scale = 1.0 / (self.upper_bounds - self.lower_bounds) + self.norm_offset = -self.norm_scale * self.lower_bounds + self.idx_scale = size * self.norm_scale + self.idx_offset = size * self.norm_offset diff --git a/main.py b/main.py index 4e4c4b8..44b7bed 100644 --- a/main.py +++ b/main.py @@ -21,6 +21,14 @@ from fr0stlib.pyflam3 import * from fr0stlib.pyflam3._flam3 import * from cuburnlib.render import * +import pyglet + +def dump_3d(nda): + with open('/tmp/data.txt', 'w') as f: + for row in nda: + f.write(' | '.join([' '.join( + ['%4.1g\t' % x for x in pt]) for pt in row]) + '\n') + def main(args): verbose = 1 if '-d' in args: @@ -30,28 +38,37 @@ def main(args): genomes = Genome.from_string(fp.read()) anim = Animation(genomes) anim.compile() - anim.render_frame() + bins = anim.render_frame() + #dump_3d(bins) + bins /= ((np.mean(bins)+1e-9)/128.) 
+ bins.astype(np.uint8) - #genome.width, genome.height = 512, 512 - #genome.sample_density = 1000 - #obuf, stats, frame = genome.render(estimator=3) - #gc.collect() + if '-g' not in args: + return - ##q.put(str(obuf)) - ##p = Process(target=render, args=(q, genome_path)) - ##p.start() + print anim.features.hist_width + print anim.features.hist_height + print anim.features.hist_stride + window = pyglet.window.Window(800, 600) + image = pyglet.image.ImageData(anim.features.hist_width, + anim.features.hist_height, + 'RGBA', + bins.tostring(), + anim.features.hist_stride*4) + tex = image.texture - #window = pyglet.window.Window() - #image = pyglet.image.ImageData(genome.width, genome.height, 'RGB', obuf) - #tex = image.texture + @window.event + def on_draw(): + window.clear() + tex.blit(0, 0) - #@window.event - #def on_draw(): - #window.clear() - #tex.blit(0, 0) + @window.event + def on_key_press(sym, mod): + if sym == pyglet.window.key.Q: + pyglet.app.exit() - #pyglet.app.run() + pyglet.app.run() if __name__ == "__main__": if len(sys.argv) < 2 or not os.path.isfile(sys.argv[-1]):