Rename "cuburnlib" (stupid) to "cuburn" (stupid but shorter)

--HG--
rename : cuburnlib/__init__.py => cuburn/__init__.py
rename : cuburnlib/cuda.py => cuburn/cuda.py
rename : cuburnlib/device_code.py => cuburn/device_code.py
rename : cuburnlib/ptx.py => cuburn/ptx.py
rename : cuburnlib/render.py => cuburn/render.py
Steven Robertson, 2010-09-10 14:48:34 -04:00
parent 4552589b35
commit 36f1c1c056
7 changed files with 10 additions and 10 deletions

cuburn/__init__.py (new file, 0 lines)

cuburn/cuda.py (new file, 105 lines)
@@ -0,0 +1,105 @@
# These imports are order-sensitive!
import pyglet
import pyglet.gl as gl
gl.get_current_context()
import pycuda.driver as cuda
import pycuda.tools
import pycuda.gl as cudagl
import pycuda.gl.autoinit
import numpy as np
from cuburn.ptx import PTXModule, PTXTest, PTXTestFailure
class LaunchContext(object):
"""
Context collecting the information needed to create, run, and gather the
results of a device computation. This may eventually also include an actual
CUDA context, but for now it just uses the global one.
To create the fastest device code across multiple device families, this
context may decide to iteratively refine the final PTX by regenerating
and recompiling it several times to optimize certain parameters of the
launch, such as the distribution of threads throughout the device.
The launch properties which are tuned are listed below. Any PTX
fragments which use this information must emit valid PTX for any state
given below, but the PTX is only required to actually run with the final,
fixed values of all tuned parameters.
`block`: 3-tuple of (x,y,z); dimensions of each CTA.
`grid`: 2-tuple of (x,y); dimensions of the grid of CTAs.
`threads`: Number of active threads on device as a whole.
`mod`: Final compiled module. Unavailable during assembly.
"""
def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
self.entry_types = entries
self.block, self.grid, self.build_tests = block, grid, tests
self.setup_done = False
@property
def threads(self):
return reduce(lambda a, b: a*b, self.block + self.grid)
@property
def ctas(self):
return self.grid[0] * self.grid[1]
@property
def threads_per_cta(self):
return self.block[0] * self.block[1] * self.block[2]
@property
def warps_per_cta(self):
return self.threads_per_cta / 32
def compile(self, verbose=False, **kwargs):
kwargs['ctx'] = self
self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
try:
self.mod = cuda.module_from_buffer(self.ptx.source)
except (cuda.CompileError, cuda.RuntimeError), e:
print "Aww, dang, compile error. Here's the source:"
self.ptx.print_source()
raise
if verbose:
if verbose >= 3:
self.ptx.print_source()
for entry in self.ptx.entries:
func = self.mod.get_function(entry.entry_name)
print "Compiled %s: used %d regs, %d sm, %d local" % (
entry.entry_name, func.num_regs,
func.shared_size_bytes, func.local_size_bytes)
def call_setup(self, entry_inst):
for inst in self.ptx.entry_deps[type(entry_inst)]:
inst.call_setup(self)
def call_teardown(self, entry_inst):
okay = True
for inst in reversed(self.ptx.entry_deps[type(entry_inst)]):
if inst is entry_inst and isinstance(entry_inst, PTXTest):
try:
inst.call_teardown(self)
except PTXTestFailure, e:
print "PTX Test %s failed!" % inst.entry_name, e
okay = False
else:
inst.call_teardown(self)
return okay
def run_tests(self):
if not self.ptx.tests:
print "No tests to run."
return True
all_okay = True
for test in self.ptx.tests:
cuda.Context.synchronize()
if test.call(self):
print "Test %s passed." % test.entry_name
else:
print "Test %s FAILED." % test.entry_name
all_okay = False
return all_okay
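For orientation, the launch arithmetic above reduces to plain products. A minimal host-side sketch (hypothetical block/grid values, no CUDA context required) of what ``threads``, ``ctas``, and ``warps_per_cta`` evaluate to:

# Sketch only; the block/grid values here are hypothetical examples.
block = (256, 1, 1)   # threads per CTA, as (x, y, z)
grid = (54, 1)        # CTAs in the grid, as (x, y)
threads_per_cta = block[0] * block[1] * block[2]   # 256
ctas = grid[0] * grid[1]                           # 54
threads = threads_per_cta * ctas                   # 13824 active threads
warps_per_cta = threads_per_cta / 32               # 8 warps of 32 threads
print "threads=%d ctas=%d warps/cta=%d" % (threads, ctas, warps_per_cta)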

cuburn/device_code.py (new file, 557 lines)
@@ -0,0 +1,557 @@
"""
Contains the PTX fragments which will drive the device.
"""
import os
import time
import struct
import pycuda.driver as cuda
import numpy as np
from cuburn.ptx import *
class IterThread(PTXEntryPoint):
entry_name = 'iter_thread'
entry_params = []
def __init__(self):
self.cps_uploaded = False
def deps(self):
return [MWCRNG, CPDataStream, HistScatter]
@ptx_func
def module_setup(self):
mem.global_.u32('g_cp_array',
cp.stream_size*features.max_ntemporal_samples)
mem.global_.u32('g_num_cps')
mem.global_.u32('g_num_cps_started')
# TODO move into debug statement
mem.global_.u32('g_num_rounds', ctx.threads)
mem.global_.u32('g_num_writes', ctx.threads)
@ptx_func
def entry(self):
# For now, we indulge in the luxury of shared memory.
# Index number of current CP, shared across CTA
mem.shared.u32('s_cp_idx')
# Number of samples that have been generated so far in this CTA
# If this number is negative, we're still fusing points, so this
# behaves slightly differently (see ``fuse_loop_start``)
mem.shared.u32('s_num_samples')
op.st.shared.u32(addr(s_num_samples), -(features.num_fuse_samples+1))
# TODO: temporary, for testing
reg.u32('num_rounds num_writes')
op.mov.u32(num_rounds, 0)
op.mov.u32(num_writes, 0)
reg.f32('x_coord y_coord color_coord')
mwc.next_f32_11(x_coord)
mwc.next_f32_11(y_coord)
mwc.next_f32_01(color_coord)
comment("Ensure all init is done")
op.bar.sync(0)
label('cp_loop_start')
reg.u32('cp_idx cpA')
with block("Claim a CP"):
std.set_is_first_thread(reg.pred('p_is_first'))
op.atom.add.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first)
op.st.shared.u32(addr(s_cp_idx), cp_idx, ifp=p_is_first)
op.st.shared.u32(addr(s_num_samples), 0, ifp=p_is_first)
comment("Load the CP index in all threads")
op.bar.sync(1)
op.ld.shared.u32(cp_idx, addr(s_cp_idx))
with block("Check to see if this CP is valid (if not, we're done)"):
reg.u32('num_cps')
reg.pred('p_last_cp')
op.ldu.u32(num_cps, addr(g_num_cps))
op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
op.bra.uni('all_cps_done', ifp=p_last_cp)
with block('Load CP address'):
op.mov.u32(cpA, g_cp_array)
op.mad.lo.u32(cpA, cp_idx, cp.stream_size, cpA)
label('fuse_loop_start')
# When fusing, num_samples holds the (negative) number of iterations
# left across the CP, rather than the number of samples in total.
with block("If still fusing, increment count unconditionally"):
std.set_is_first_thread(reg.pred('p_is_first'))
op.red.shared.add.s32(addr(s_num_samples), 1, ifp=p_is_first)
op.bar.sync(2)
label('iter_loop_start')
comment('Do... well, most of everything')
mwc.next_f32_11(x_coord)
mwc.next_f32_11(y_coord)
mwc.next_f32_01(color_coord)
op.add.u32(num_rounds, num_rounds, 1)
with block("Test if we're still in FUSE"):
reg.s32('num_samples')
reg.pred('p_in_fuse')
op.ld.shared.s32(num_samples, addr(s_num_samples))
op.setp.lt.s32(p_in_fuse, num_samples, 0)
op.bra.uni(fuse_loop_start, ifp=p_in_fuse)
reg.pred('p_point_is_valid')
with block("Write the result"):
hist.scatter(x_coord, y_coord, color_coord, 0, p_point_is_valid)
op.add.u32(num_writes, num_writes, 1, ifp=p_point_is_valid)
with block("Increment number of samples by number of good values"):
reg.b32('good_samples laneid')
reg.pred('p_is_first')
op.vote.ballot.b32(good_samples, p_point_is_valid)
op.popc.b32(good_samples, good_samples)
op.mov.u32(laneid, '%laneid')
op.setp.eq.u32(p_is_first, laneid, 0)
op.red.shared.add.s32(addr(s_num_samples), good_samples,
ifp=p_is_first)
with block("Check to see if we're done with this CP"):
reg.pred('p_cp_done')
reg.s32('num_samples num_samples_needed')
op.ld.shared.s32(num_samples, addr(s_num_samples))
cp.get(cpA, num_samples_needed, 'cp.nsamples')
op.setp.ge.s32(p_cp_done, num_samples, num_samples_needed)
op.bra.uni(cp_loop_start, ifp=p_cp_done)
op.bra.uni(iter_loop_start)
label('all_cps_done')
# TODO this is for testing, move it to a debug statement
std.store_per_thread(g_num_rounds, num_rounds)
std.store_per_thread(g_num_writes, num_writes)
@instmethod
def upload_cp_stream(self, ctx, cp_stream, num_cps):
cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
assert len(cp_stream) <= cp_array_l, "Stream too big!"
cuda.memcpy_htod_async(cp_array_dp, cp_stream)
num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
cuda.memset_d32(num_cps_dp, num_cps, 1)
# TODO: "if debug >= 3"
print "Uploaded stream to card:"
CPDataStream.print_record(ctx, cp_stream, 5)
self.cps_uploaded = True
def call_setup(self, ctx):
if not self.cps_uploaded:
raise ValueError("Cannot call IterThread before uploading CPs")
num_cps_st_dp, num_cps_st_l = ctx.mod.get_global('g_num_cps_started')
cuda.memset_d32(num_cps_st_dp, 0, 1)
def _call(self, ctx, func):
# Get texture reference from the Palette
# TODO: more elegant method than reaching into ctx.ptx?
tr = ctx.ptx.instances[PaletteLookup].texref
super(IterThread, self)._call(ctx, func, texrefs=[tr])
def call_teardown(self, ctx):
shape = (ctx.grid[0], ctx.block[0]/32, 32)
num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
rounds = cuda.from_device(num_rounds_dp, shape, np.int32)
writes = cuda.from_device(num_writes_dp, shape, np.int32)
print "Rounds:", sum(rounds)
print "Writes:", sum(writes)
print rounds
print writes
class CameraTransform(PTXFragment):
shortname = 'camera'
def deps(self):
return [CPDataStream]
@ptx_func
def rotate(self, rotated_x, rotated_y, x, y):
"""
Rotate an IFS-space coordinate as defined by the camera.
"""
if features.camera_rotation:
assert rotated_x.name != x.name and rotated_y.name != y.name
with block("Rotate %s, %s to camera alignment" % (x, y)):
reg.f32('rot_center_x rot_center_y')
cp.get_v2(cpA, rot_center_x, 'cp.rot_center[0]',
rot_center_y, 'cp.rot_center[1]')
op.sub.f32(x, x, rot_center_x)
op.sub.f32(y, y, rot_center_y)
reg.f32('rot_sin_t rot_cos_t rot_old_x rot_old_y')
cp.get_v2(cpA, rot_cos_t, 'cos(cp.rotate * 2 * pi / 360.)',
rot_sin_t, '-sin(cp.rotate * 2 * pi / 360.)')
comment('rotated_x = x * cos(t) - y * sin(t) + rot_center_x')
op.fma.rn.f32(rotated_x, x, rot_cos_t, rot_center_x)
op.fma.rn.f32(rotated_x, y, rot_sin_t, rotated_x)
op.neg.f32(rot_sin_t, rot_sin_t)
comment('rotated_y = x * sin(t) + y * cos(t) + rot_center_y')
op.fma.rn.f32(rotated_y, x, rot_sin_t, rot_center_y)
op.fma.rn.f32(rotated_y, y, rot_cos_t, rotated_y)
# TODO: if this is a register-critical section, reloading
# rot_center_[xy] here should save two regs. OTOH, if this is
# *not* reg-crit, moving the subtraction above to new variables
# may save a few clocks
op.add.f32(x, x, rot_center_x)
op.add.f32(y, y, rot_center_y)
else:
comment("No camera rotation in this kernel")
op.mov.f32(rotated_x, x)
op.mov.f32(rotated_y, y)
@ptx_func
def get_norm(self, norm_x, norm_y, x, y):
"""
Find the [0,1]-normalized floating-point histogram coordinates
``norm_x, norm_y`` from the given IFS-space coordinates ``x, y``.
"""
self.rotate(norm_x, norm_y, x, y)
with block("Scale rotated points to [0,1]-normalized coordinates"):
reg.f32('cam_scale cam_offset')
cp.get_v2(cpA, cam_scale, 'cp.camera.norm_scale[0]',
cam_offset, 'cp.camera.norm_offset[0]')
op.fma.rn.f32(norm_x, norm_x, cam_scale, cam_offset)
cp.get_v2(cpA, cam_scale, 'cp.camera.norm_scale[1]',
cam_offset, 'cp.camera.norm_offset[1]')
op.fma.rn.f32(norm_y, norm_y, cam_scale, cam_offset)
@ptx_func
def get_index(self, index, x, y, pred=None):
"""
Find the histogram index (as a u32) from the IFS spatial coordinate in
``x, y``.
If the coordinates are out of bounds, 0xffffffff will be stored to
``index``. If ``pred`` is given, it will be set if the point is valid,
and cleared if not.
"""
# A few instructions could probably be shaved off of this one
with block("Find histogram index"):
reg.f32('norm_x norm_y')
self.rotate(norm_x, norm_y, x, y)
comment('Scale and offset from IFS to index coordinates')
reg.f32('cam_scale cam_offset')
cp.get_v2(cpA, cam_scale, 'cp.camera.idx_scale[0]',
cam_offset, 'cp.camera.idx_offset[0]')
op.fma.rn.f32(norm_x, norm_x, cam_scale, cam_offset)
cp.get_v2(cpA, cam_scale, 'cp.camera.idx_scale[1]',
cam_offset, 'cp.camera.idx_offset[1]')
op.fma.rn.f32(norm_y, norm_y, cam_scale, cam_offset)
comment('Check for bad value')
reg.u32('index_x index_y')
if not pred:
pred = reg.pred('p_valid')
op.cvt.rzi.s32.f32(index_x, norm_x)
op.setp.ge.s32(pred, index_x, 0)
op.setp.lt.and_.s32(pred, index_x, features.hist_width, pred)
op.cvt.rzi.s32.f32(index_y, norm_y)
op.setp.ge.and_.s32(pred, index_y, 0, pred)
op.setp.lt.and_.s32(pred, index_y, features.hist_height, pred)
op.mad.lo.u32(index, index_y, features.hist_stride, index_x)
op.mov.u32(index, 0xffffffff, ifnotp=pred)
class PaletteLookup(PTXFragment):
shortname = "palette"
# Resolution of texture on device. Bigger = more palette rez, maybe slower
texheight = 16
def __init__(self):
self.texref = None
def deps(self):
return [CPDataStream]
@ptx_func
def module_setup(self):
mem.global_.texref('t_palette')
@ptx_func
def look_up(self, r, g, b, a, color, norm_time):
"""
Look up the values of ``r, g, b, a`` corresponding to ``color`` at the
CP time given by ``norm_time``. Note that both ``color`` and
``norm_time`` should be [0,1]-normalized floats.
"""
op.tex._2d.v4.f32.f32(vec(r, g, b, a),
addr([t_palette, ', ', vec(norm_time, color)]))
if features.non_box_temporal_filter:
raise NotImplementedError("Non-box temporal filters not supported")
@instmethod
def upload_palette(self, ctx, frame, cp_list):
"""
Extract the palette from the given list of interpolated CPs, and upload
it to the device as a texture.
"""
# TODO: figure out if storing the full list is an actual drag on
# performance/memory
if frame.center_cp.temporal_filter_type != 0:
# TODO: make texture sample based on time, not on CP index
raise NotImplementedError("Use box temporal filters for now")
pal = np.ndarray((self.texheight, 256, 4), dtype=np.float32)
inv = float(len(cp_list) - 1) / (self.texheight - 1)
for y in range(self.texheight):
for x in range(256):
for c in range(4):
# TODO: interpolate here?
cy = int(round(y * inv))
pal[y][x][c] = cp_list[cy].palette.entries[x].color[c]
dev_array = cuda.make_multichannel_2d_array(pal, "C")
self.texref = ctx.mod.get_texref('t_palette')
# TODO: float16? or can we still use interp with int storage?
self.texref.set_format(cuda.array_format.FLOAT, 4)
self.texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
self.texref.set_filter_mode(cuda.filter_mode.LINEAR)
self.texref.set_address_mode(0, cuda.address_mode.CLAMP)
self.texref.set_address_mode(1, cuda.address_mode.CLAMP)
self.texref.set_array(dev_array)
def call_setup(self, ctx):
assert self.texref, "Must upload palette texture before launch!"
class HistScatter(PTXFragment):
shortname = "hist"
def deps(self):
return [CPDataStream, CameraTransform, PaletteLookup]
@ptx_func
def module_setup(self):
mem.global_.f32('g_hist_bins',
features.hist_height * features.hist_stride * 4)
@ptx_func
def entry_setup(self):
comment("For now, assume histogram bins have been cleared by host")
@ptx_func
def scatter(self, x, y, color, xf_idx, p_valid=None):
"""
Scatter the given point directly to the histogram bins. I think this
technique has the worst performance of all of 'em. Accesses ``cpA``
directly.
"""
with block("Scatter directly to buffer"):
if p_valid is None:
p_valid = reg.pred('p_valid')
reg.u32('hist_index')
camera.get_index(hist_index, x, y, p_valid)
reg.u32('hist_bin_addr')
op.mov.u32(hist_bin_addr, g_hist_bins)
op.mad.lo.u32(hist_bin_addr, hist_index, 16, hist_bin_addr)
reg.f32('r g b a norm_time')
cp.get(cpA, norm_time, 'cp.norm_time')
palette.look_up(r, g, b, a, color, norm_time)
# TODO: look up, scale by xform visibility
op.red.add.f32(addr(hist_bin_addr), r)
op.red.add.f32(addr(hist_bin_addr,4), g)
op.red.add.f32(addr(hist_bin_addr,8), b)
op.red.add.f32(addr(hist_bin_addr,12), a)
def call_setup(self, ctx):
hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins')
cuda.memset_d32(hist_bins_dp, 0, hist_bins_l/4)
@instmethod
def get_bins(self, ctx, features):
hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins')
return cuda.from_device(hist_bins_dp,
(features.hist_height, features.hist_stride, 4),
dtype=np.float32)
class MWCRNG(PTXFragment):
shortname = "mwc"
def __init__(self):
self.threads_ready = 0
if not os.path.isfile('primes.bin'):
raise EnvironmentError('primes.bin not found')
@ptx_func
def module_setup(self):
mem.global_.u32('mwc_rng_mults', ctx.threads)
mem.global_.u64('mwc_rng_state', ctx.threads)
@ptx_func
def entry_setup(self):
reg.u32('mwc_st mwc_mult mwc_car')
with block('Load MWC multipliers and states'):
reg.u32('mwc_off mwc_addr')
std.get_gtid(mwc_off)
op.mov.u32(mwc_addr, mwc_rng_mults)
op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)
op.ld.global_.u32(mwc_mult, addr(mwc_addr))
op.mov.u32(mwc_addr, mwc_rng_state)
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr))
@ptx_func
def entry_teardown(self):
with block('Save MWC states'):
reg.u32('mwc_off mwc_addr')
std.get_gtid(mwc_off)
op.mov.u32(mwc_addr, mwc_rng_state)
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
@ptx_func
def _next(self):
# Call from inside a block!
reg.u64('mwc_out')
op.cvt.u64.u32(mwc_out, mwc_car)
op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
@ptx_func
def next_b32(self, dst_reg):
with block('Load next random u32 into ' + dst_reg.name):
self._next()
op.mov.u32(dst_reg, mwc_st)
@ptx_func
def next_f32_01(self, dst_reg):
# TODO: verify that this is the fastest-performance method
# TODO: verify that this actually does what I think it does
with block('Load random float [0,1] into ' + dst_reg.name):
self._next()
op.cvt.rn.f32.u32(dst_reg, mwc_st)
op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)
@ptx_func
def next_f32_11(self, dst_reg):
with block('Load random float [-1,1) into ' + dst_reg.name):
self._next()
op.cvt.rn.f32.s32(dst_reg, mwc_st)
op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)
@instmethod
def seed(self, ctx, rand=np.random):
"""
Seed the random number generators with values taken from a
``np.random`` instance.
"""
# Load raw big-endian u32 multipliers from primes.bin.
with open('primes.bin') as primefp:
dt = np.dtype(np.uint32).newbyteorder('B')
mults = np.frombuffer(primefp.read(), dtype=dt)
stream = cuda.Stream()
# Randomness in choosing multipliers is good, but larger multipliers
# have longer periods, which is also good. This is a compromise.
mults = np.array(mults[:ctx.threads*4])
rand.shuffle(mults)
# Copy multipliers and seeds to the device
multdp, multl = ctx.mod.get_global('mwc_rng_mults')
cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
# Intentionally excludes both 0 and (2^32-1), as they can lead to
# degenerate sequences of period 0
states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.threads),
dtype=np.uint32)
statedp, statel = ctx.mod.get_global('mwc_rng_state')
cuda.memcpy_htod_async(statedp, states.tostring())
self.threads_ready = ctx.threads
def call_setup(self, ctx):
if self.threads_ready < ctx.threads:
self.seed(ctx)
def tests(self):
return [MWCRNGTest]
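As a host-side cross-check of the PTX above, the multiply-with-carry step and the [0,1] float mapping are easy to mirror in a few lines. A sketch (the multiplier below is an arbitrary example value, not one drawn from primes.bin):

# Host sketch of one MWC step, mirroring _next() and next_f32_01().
def mwc_next(state, carry, mult):
    # 64-bit product-plus-carry; the low word is the new state, the high
    # word the new carry (op.mad.wide.u32 followed by op.mov.b64 above).
    out = state * mult + carry
    return out & 0xffffffff, out >> 32

state, carry, mult = 0x12345678, 1, 4294963023  # example values only
for _ in range(3):
    state, carry = mwc_next(state, carry, mult)
    print state * 2.0 ** -32   # same scaling as the '0f2F800000' constant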
class MWCRNGTest(PTXTest):
name = "MWC RNG sum-of-threads"
rounds = 5000
entry_name = 'MWC_RNG_test'
entry_params = ''
def deps(self):
return [MWCRNG]
@ptx_func
def module_setup(self):
mem.global_.u64('mwc_rng_test_sums', ctx.threads)
@ptx_func
def entry(self):
reg.u64('sum addl')
reg.u32('addend')
op.mov.u64(sum, 0)
with block('Sum next %d random numbers' % self.rounds):
reg.u32('loopct')
reg.pred('p')
op.mov.u32(loopct, self.rounds)
label('loopstart')
mwc.next_b32(addend)
op.cvt.u64.u32(addl, addend)
op.add.u64(sum, sum, addl)
op.sub.u32(loopct, loopct, 1)
op.setp.gt.u32(p, loopct, 0)
op.bra.uni(loopstart, ifp=p)
with block('Store sum and state'):
reg.u32('adr offset')
std.get_gtid(offset)
op.mov.u32(adr, mwc_rng_test_sums)
op.mad.lo.u32(adr, offset, 8, adr)
op.st.global_.u64(addr(adr), sum)
def call_setup(self, ctx):
# Get current multipliers and seeds from the device, keeping them on
# the instance so call_teardown can check the device's results
multdp, multl = ctx.mod.get_global('mwc_rng_mults')
self.mults = cuda.from_device(multdp, ctx.threads, np.uint32)
statedp, statel = ctx.mod.get_global('mwc_rng_state')
self.fullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
sums = np.zeros(ctx.threads, np.uint64)
print "Running %d states forward %d rounds" % (len(self.mults), self.rounds)
ctime = time.time()
for i in range(self.rounds):
states = self.fullstates & 0xffffffff
carries = self.fullstates >> 32
self.fullstates = self.mults * states + carries
sums = sums + (self.fullstates & 0xffffffff)
self.sums = sums
ctime = time.time() - ctime
print "Done on host, took %g seconds" % ctime
def call_teardown(self, ctx):
statedp, statel = ctx.mod.get_global('mwc_rng_state')
dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
if not (dfullstates == self.fullstates).all():
print "State discrepancy"
print dfullstates
print self.fullstates
raise PTXTestFailure("MWC RNG state discrepancy")
sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')
dsums = cuda.from_device(sumdp, ctx.threads, np.uint64)
if not (dsums == self.sums).all():
print "Sum discrepancy"
print dsums
print self.sums
raise PTXTestFailure("MWC RNG sum discrepancy")
class CPDataStream(DataStream):
"""DataStream which stores the control points."""
shortname = 'cp'

cuburn/ptx.py (new file, 1160 lines)

File diff suppressed because it is too large.

cuburn/render.py (new file, 223 lines)
@@ -0,0 +1,223 @@
import math
from ctypes import *
from cStringIO import StringIO
import numpy as np
from fr0stlib import pyflam3
from fr0stlib.pyflam3._flam3 import *
from fr0stlib.pyflam3.constants import *
from cuburn.cuda import LaunchContext
from cuburn.device_code import *
Point = lambda x, y: np.array([x, y], dtype=np.double)
class Genome(pyflam3.Genome):
pass
class _Frame(pyflam3.Frame):
"""
ctypes flam3_frame object used for genome interpolation and
spatial filter creation
"""
def __init__(self, genomes, *args, **kwargs):
pyflam3.Frame.__init__(self, *args, **kwargs)
self.genomes = (BaseGenome * len(genomes))()
for i in range(len(genomes)):
memmove(byref(self.genomes[i]), byref(genomes[i]),
sizeof(BaseGenome))
self.ngenomes = len(genomes)
# TODO: do this here?
self.pixel_aspect_ratio = float(genomes[0].height) / genomes[0].width
def interpolate(self, time, stagger=0, cp=None):
cp = cp or BaseGenome()
flam3_interpolate(self.genomes, self.ngenomes, time,
stagger, byref(cp))
return cp
class Frame(object):
"""
Handler for a single frame of a rendered genome.
"""
def __init__(self, _frame, time):
self._frame = _frame
self.center_cp = self._frame.interpolate(time)
def upload_data(self, ctx, filters, time):
"""
Prepare and upload the data needed to render this frame to the device.
"""
center = self.center_cp
ncps = center.nbatches * center.ntemporal_samples
if ncps < ctx.ctas:
raise NotImplementedError(
"Distribution of a CP across multiple CTAs not yet done")
# TODO: isn't this leaking ctypes xforms all over the place?
stream = StringIO()
cp_list = []
for batch_idx in range(center.nbatches):
for time_idx in range(center.ntemporal_samples):
idx = batch_idx * center.ntemporal_samples + time_idx
cp_time = time + filters.temporal_deltas[idx]
cp = self._frame.interpolate(cp_time)
cp_list.append(cp)
cp.camera = Camera(self._frame, cp, filters)
cp.nsamples = (cp.camera.sample_density *
center.width * center.height) / ncps
print "Expected writes:", (
cp.camera.sample_density * center.width * center.height)
min_time = min(filters.temporal_deltas)
max_time = max(filters.temporal_deltas)
for i, cp in enumerate(cp_list):
cp.norm_time = (filters.temporal_deltas[i] - min_time) / (
max_time - min_time)
CPDataStream.pack_into(ctx, stream, frame=self, cp=cp, cp_idx=i)
PaletteLookup.upload_palette(ctx, self, cp_list)
stream.seek(0)
IterThread.upload_cp_stream(ctx, stream.read(), ncps)
class Animation(object):
"""
Control structure for rendering a series of frames.
Each animation will dynamically generate a kernel that includes only the
code necessary to render the genomes provided. The process of generating
and uploading the kernel takes a small but finite amount of time. In
general, the kernel generated for all genomes resulting from interpolating
between two control points will have identical performance, so it is
wasteful to create more than one animation for any interpolated sequence.
However, genome sequences interpolated from three or more control points
with different features enabled will carry the code needed to render all
of those genomes in every frame, which can hurt performance.
In other words, it's best to use exactly one Animation for each
interpolated sequence between one or two genomes.
"""
def __init__(self, genomes):
# _frame is the ctypes frame object used only for interpolation
self._frame = _Frame(genomes)
# Use the same set of filters throughout the anim, a la flam3
self.filters = Filters(self._frame, genomes[0])
self.features = Features(genomes, self.filters)
self.ctx = None
def compile(self):
"""
Create a PTX kernel optimized for this animation, compile it, and
attach it to a LaunchContext with a thread distribution optimized for
the active device.
"""
# TODO: user-configurable test control
self.ctx = LaunchContext([IterThread], block=(256,1,1), grid=(54,1),
tests=True)
# TODO: user-configurable verbosity control
self.ctx.compile(verbose=3, anim=self, features=self.features)
# TODO: automatic optimization of block parameters
def render_frame(self, time=0):
# TODO: support more nuanced frame control than just 'time'
# TODO: reuse more information between frames
# TODO: allow animation-long override of certain parameters (size, etc)
frame = Frame(self._frame, time)
frame.upload_data(self.ctx, self.filters, time)
IterThread.call(self.ctx)
return HistScatter.get_bins(self.ctx, self.features)
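A sketch of how these pieces fit together, assuming a list of pyflam3 genomes is already in hand (load_genomes below is a hypothetical helper, not part of this commit):

from cuburn.render import Animation

genomes = load_genomes('flame.xml')   # hypothetical loader via fr0stlib
anim = Animation(genomes)
anim.compile()                        # generate and build the PTX kernel
bins = anim.render_frame(time=0)      # float32 bins, (height, stride, 4)
print bins.shape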
class Filters(object):
def __init__(self, frame, cp):
# Use one oversample per filter set, even over multiple timesteps
self.oversample = frame.genomes[0].spatial_oversample
# Ugh. I'd really like to replace this mess
spa_filt_ptr = POINTER(c_double)()
spa_width = flam3_create_spatial_filter(byref(frame),
flam3_field_both,
byref(spa_filt_ptr))
if spa_width < 0:
raise EnvironmentError("flam3 call failed")
self.spatial = np.asarray([[spa_filt_ptr[y*spa_width+x] for x in
range(spa_width)] for y in range(spa_width)], dtype=np.double)
self.spatial_width = spa_width
flam3_free(spa_filt_ptr)
tmp_filt_ptr = POINTER(c_double)()
tmp_deltas_ptr = POINTER(c_double)()
steps = cp.nbatches * cp.ntemporal_samples
self.temporal_sum = flam3_create_temporal_filter(
steps,
cp.temporal_filter_type,
cp.temporal_filter_exp,
cp.temporal_filter_width,
byref(tmp_filt_ptr),
byref(tmp_deltas_ptr))
self.temporal = np.asarray([tmp_filt_ptr[i] for i in range(steps)],
dtype=np.double)
flam3_free(tmp_filt_ptr)
self.temporal_deltas = np.asarray(
[tmp_deltas_ptr[i] for i in range(steps)], dtype=np.double)
flam3_free(tmp_deltas_ptr)
# TODO: density estimation
self.gutter = (spa_width - self.oversample) / 2
class Features(object):
"""
Determine features and constants required to render a particular set of
genomes. The values of this class are fixed before compilation begins.
"""
# Constant; number of rounds spent fusing points on first CP of a frame
num_fuse_samples = 25
def __init__(self, genomes, flt):
any = lambda l: bool(filter(None, map(l, genomes)))
self.max_ntemporal_samples = max(
[cp.nbatches * cp.ntemporal_samples for cp in genomes])
self.camera_rotation = any(lambda cp: cp.rotate)
self.non_box_temporal_filter = genomes[0].temporal_filter_type
self.palette_mode = genomes[0].palette_mode and "linear" or "nearest"
# Histogram (and log-density copy) width and height
self.hist_width = flt.oversample * genomes[0].width + 2 * flt.gutter
self.hist_height = flt.oversample * genomes[0].height + 2 * flt.gutter
# Histogram stride, for better filtering. This code assumes the
# 128-byte L1 cache line width of Fermi devices, and a 16-byte
# histogram bucket size. TODO: detect these things programmatically,
# particularly the histogram bucket size, which may be split soon
self.hist_stride = 8 * int(math.ceil(self.hist_width / 8.0))
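Concretely, the stride rounds each bucket row up to a multiple of 8 buckets, since 8 buckets times 16 bytes is one 128-byte Fermi L1 line. A quick standalone check:

import math
# hist_width -> hist_stride, padded to an 8-bucket (128-byte) boundary
for w in (640, 641, 648):
    print w, 8 * int(math.ceil(w / 8.0))   # strides: 640, 648, 648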
class Camera(object):
"""Viewport and exposure."""
def __init__(self, frame, cp, filters):
# Calculate the conversion matrix between the IFS space (xform
# coordinates) and the sampling lattice (bucket addresses)
# TODO: test this code (against compute_camera?)
scale = 2.0 ** cp.zoom
self.sample_density = cp.sample_density * scale * scale
center = Point(cp._center[0], cp._center[1])
size = Point(cp.width, cp.height)
# pix per unit, where 'unit' is '1.0' in IFS space
self.ppu = Point(
cp.pixels_per_unit * scale / frame.pixel_aspect_ratio,
cp.pixels_per_unit * scale)
# extra shifts applied due to gutter
gutter = filters.gutter / (cp.spatial_oversample * self.ppu)
cornerLL = center - (size / (2 * self.ppu))
self.lower_bounds = cornerLL - gutter
self.upper_bounds = cornerLL + (size / self.ppu) + gutter
self.norm_scale = 1.0 / (self.upper_bounds - self.lower_bounds)
self.norm_offset = -self.norm_scale * self.lower_bounds
self.idx_scale = size * self.norm_scale
self.idx_offset = size * self.norm_offset
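As a sanity check of the normalization math, norm_scale and norm_offset should map lower_bounds to 0 and upper_bounds to 1 componentwise. A standalone sketch with made-up bounds:

import numpy as np
# Made-up IFS-space bounds; the identities below hold for any bounds.
lower = np.array([-2.0, -1.5])
upper = np.array([ 2.0,  1.5])
norm_scale = 1.0 / (upper - lower)
norm_offset = -norm_scale * lower
print lower * norm_scale + norm_offset   # [ 0.  0.]
print upper * norm_scale + norm_offset   # [ 1.  1.]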