mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
225 lines
9.1 KiB
Python
225 lines
9.1 KiB
Python
import math
|
|
from ctypes import *
|
|
from cStringIO import StringIO
|
|
import numpy as np
|
|
|
|
from fr0stlib import pyflam3
|
|
from fr0stlib.pyflam3._flam3 import *
|
|
from fr0stlib.pyflam3.constants import *
|
|
|
|
from cuburnlib.cuda import LaunchContext
|
|
from cuburnlib.device_code import *
|
|
|
|
def Point(x, y):
    """Return the pair ``(x, y)`` as a 1-D numpy array of doubles."""
    return np.array([x, y], dtype=np.double)
|
|
|
|
class Genome(pyflam3.Genome):
    """Genome type for this module; adds nothing to ``pyflam3.Genome``."""
|
|
|
|
class _Frame(pyflam3.Frame):
    """
    ctypes flam3_frame object used for genome interpolation and
    spatial filter creation
    """
    def __init__(self, genomes, *args, **kwargs):
        """
        Build the frame and copy every entry of ``genomes`` into it.

        Extra positional and keyword arguments are forwarded unchanged to
        ``pyflam3.Frame.__init__``.  ``genomes`` must contain at least one
        entry, since the first one supplies the pixel aspect ratio.
        """
        pyflam3.Frame.__init__(self, *args, **kwargs)

        count = len(genomes)
        self.genomes = (BaseGenome * count)()
        # Byte-copy each source genome into the freshly allocated array so
        # the frame holds its own BaseGenome-sized copies.
        for slot, source in enumerate(genomes):
            memmove(byref(self.genomes[slot]), byref(source),
                    sizeof(BaseGenome))
        self.ngenomes = count

        # TODO: do this here?
        first = genomes[0]
        self.pixel_aspect_ratio = float(first.height) / first.width

    def interpolate(self, time, stagger=0, cp=None):
        """
        Call ``flam3_interpolate`` over this frame's genomes at ``time``.

        The result is written into ``cp``; when ``cp`` is not supplied (or is
        falsy) a new ``BaseGenome`` is allocated for it.  Returns the genome
        that was written to.
        """
        if not cp:
            cp = BaseGenome()
        flam3_interpolate(self.genomes, self.ngenomes, time,
                          stagger, byref(cp))
        return cp
|
|
|
|
class Frame(object):
    """
    Handler for a single frame of a rendered genome.
    """
    def __init__(self, _frame, time):
        """
        Store the interpolation frame and interpolate the control point at
        ``time``, kept as ``self.center_cp``.
        """
        self._frame = _frame
        self.center_cp = self._frame.interpolate(time)

    def upload_data(self, ctx, filters, time):
        """
        Prepare and upload the data needed to render this frame to the device.

        One control point is interpolated per temporal sample, at
        ``time + filters.temporal_deltas[idx]``, where ``idx`` is the flat
        index ``batch_idx * ntemporal_samples + time_idx``.  Each control
        point gets a ``camera``, an ``nsamples`` share and a ``norm_time``,
        is packed into a stream via ``CPDataStream.pack_into``, and the
        stream and palette are then uploaded.

        Raises NotImplementedError when there are fewer control points than
        ``ctx.ctas``.
        """
        center = self.center_cp
        ncps = center.nbatches * center.ntemporal_samples

        if ncps < ctx.ctas:
            raise NotImplementedError(
                "Distribution of a CP across multiple CTAs not yet done")

        # TODO: isn't this leaking ctypes xforms all over the place?
        stream = StringIO()
        cp_list = []
        deltas = filters.temporal_deltas

        for batch_idx in range(center.nbatches):
            for time_idx in range(center.ntemporal_samples):
                # Row-major flat index: each batch owns ntemporal_samples
                # consecutive entries (the stride is NOT nbatches).
                idx = time_idx + batch_idx * center.ntemporal_samples
                # Each delta is an offset from the frame's own time; do not
                # accumulate offsets from one sample into the next.
                cp = self._frame.interpolate(time + deltas[idx])
                cp_list.append(cp)

                cp.camera = Camera(self._frame, cp, filters)
                cp.nsamples = (cp.camera.sample_density *
                               center.width * center.height) / ncps

        # Reported once, using the last interpolated control point.  Written
        # as a single-argument print so the output is the same on Python 2
        # and Python 3.
        print("Expected writes: %s" % (
                cp.camera.sample_density * center.width * center.height,))

        min_time = min(deltas)
        time_span = max(deltas) - min_time
        for i, cp in enumerate(cp_list):
            if time_span:
                cp.norm_time = (deltas[i] - min_time) / time_span
            else:
                # All deltas equal (e.g. a single sample): avoid 0/0.
                cp.norm_time = 0.0
            # Pass this control point's own index, not the last value left
            # over from the interpolation loop.
            CPDataStream.pack_into(ctx, stream, frame=self, cp=cp, cp_idx=i)

        PaletteLookup.upload_palette(ctx, self, cp_list)
        stream.seek(0)
        IterThread.upload_cp_stream(ctx, stream.read(), ncps)
|
|
|
|
class Animation(object):
    """
    Control structure for rendering a series of frames.

    Each animation will dynamically generate a kernel that includes only the
    code necessary to render the genomes provided. The process of generating
    and uploading the kernel takes a small but finite amount of time. In
    general, the kernel generated for all genomes resulting from interpolating
    between two control points will have identical performance, so it is
    wasteful to create more than one animation for any interpolated sequence.

    However, genome sequences interpolated from three or more control points
    with different features enabled will have the code needed to render all
    genomes enabled for every frame. Doing this can hurt performance.

    In other words, it's best to use exactly one Animation for each
    interpolated sequence between one or two genomes.
    """
    def __init__(self, genomes):
        """
        Build the interpolation frame, filters and feature set for
        ``genomes``.  No launch context exists until ``compile`` runs.
        """
        # _frame is the ctypes frame object used only for interpolation
        self._frame = _Frame(genomes)

        # Use the same set of filters throughout the anim, a la flam3
        self.filters = Filters(self._frame, genomes[0])
        self.features = Features(genomes, self.filters)

        self.ctx = None

    def compile(self, block=(256, 1, 1), grid=(54, 1), tests=True, verbose=3):
        """
        Create a PTX kernel optimized for this animation, compile it, and
        attach it to a LaunchContext with a thread distribution optimized for
        the active device.

        ``block``, ``grid`` and ``tests`` are passed straight to the
        ``LaunchContext`` constructor and ``verbose`` to its ``compile``
        method.  The defaults are the values that used to be hard-coded, so
        calling ``compile()`` with no arguments behaves as before.
        """
        self.ctx = LaunchContext([IterThread], block=block, grid=grid,
                                 tests=tests)
        self.ctx.compile(verbose=verbose, anim=self, features=self.features)
        # TODO: automatic optimization of block parameters

    def render_frame(self, time=0):
        """
        Render the frame at ``time`` and return the value of
        ``HistScatter.get_bins`` for this animation's context.

        If ``compile`` has not been called yet, it is invoked with its
        default arguments first.
        """
        # TODO: support more nuanced frame control than just 'time'
        # TODO: reuse more information between frames
        # TODO: allow animation-long override of certain parameters (size, etc)
        if self.ctx is None:
            # Without a context the upload below would fail on a None
            # attribute lookup; compile with defaults instead.
            self.compile()

        frame = Frame(self._frame, time)
        frame.upload_data(self.ctx, self.filters, time)
        self.ctx.set_up()
        IterThread.call(self.ctx)
        return HistScatter.get_bins(self.ctx, self.features)
|
|
|
|
class Filters(object):
    """
    Spatial and temporal filter tables obtained from flam3 and copied into
    numpy arrays.
    """
    def __init__(self, frame, cp):
        """
        Create the filters for ``frame`` using the temporal settings of
        ``cp``.

        Sets ``oversample``, ``spatial``, ``spatial_width``, ``temporal_sum``,
        ``temporal``, ``temporal_deltas`` and ``gutter``.  Raises
        EnvironmentError when the spatial filter call returns a negative
        width.
        """
        # Use one oversample per filter set, even over multiple timesteps
        self.oversample = frame.genomes[0].spatial_oversample

        # Ugh. I'd really like to replace this mess
        spatial_ptr = POINTER(c_double)()
        width = flam3_create_spatial_filter(byref(frame),
                                            flam3_field_both,
                                            byref(spatial_ptr))
        if width < 0:
            raise EnvironmentError("flam3 call failed")

        # Copy the flat width*width buffer out row by row before freeing it.
        rows = []
        for y in range(width):
            row_start = y * width
            rows.append([spatial_ptr[row_start + x] for x in range(width)])
        self.spatial = np.asarray(rows, dtype=np.double)
        self.spatial_width = width
        flam3_free(spatial_ptr)

        weights_ptr = POINTER(c_double)()
        deltas_ptr = POINTER(c_double)()
        steps = cp.nbatches * cp.ntemporal_samples
        self.temporal_sum = flam3_create_temporal_filter(
                steps,
                cp.temporal_filter_type,
                cp.temporal_filter_exp,
                cp.temporal_filter_width,
                byref(weights_ptr),
                byref(deltas_ptr))

        # Each output buffer is read for ``steps`` entries, then released.
        step_range = range(steps)
        weights = [weights_ptr[i] for i in step_range]
        self.temporal = np.asarray(weights, dtype=np.double)
        flam3_free(weights_ptr)

        deltas = [deltas_ptr[i] for i in step_range]
        self.temporal_deltas = np.asarray(deltas, dtype=np.double)
        flam3_free(deltas_ptr)

        # TODO: density estimation
        self.gutter = (width - self.oversample) / 2
|
|
|
|
class Features(object):
    """
    Determine features and constants required to render a particular set of
    genomes. The values of this class are fixed before compilation begins.
    """
    # Constant; number of rounds spent fusing points on first CP of a frame
    num_fuse_samples = 25

    def __init__(self, genomes, flt):
        """
        Derive the feature flags and histogram dimensions.

        ``genomes`` is a non-empty sequence of genome objects; ``flt``
        supplies ``oversample`` and ``gutter``.  Per-genome settings other
        than the sample count and rotation are read from ``genomes[0]``
        only.
        """
        first = genomes[0]

        self.max_ntemporal_samples = max(
                cp.nbatches * cp.ntemporal_samples for cp in genomes)
        # True as soon as any genome has a non-zero rotation.
        self.camera_rotation = any(cp.rotate for cp in genomes)
        self.non_box_temporal_filter = first.temporal_filter_type
        self.palette_mode = "linear" if first.palette_mode else "nearest"

        # Histogram (and log-density copy) width and height
        self.hist_width = flt.oversample * first.width + 2 * flt.gutter
        self.hist_height = flt.oversample * first.height + 2 * flt.gutter
        # Histogram stride, for better filtering. This code assumes the
        # 128-byte L1 cache line width of Fermi devices, and a 16-byte
        # histogram bucket size. TODO: detect these things programmatically,
        # particularly the histogram bucket size, which may be split soon
        self.hist_stride = 8 * int(math.ceil(self.hist_width / 8.0))
|
|
|
|
class Camera(object):
    """Viewport and exposure."""
    def __init__(self, frame, cp, filters):
        """
        Compute the view bounds and the affine maps derived from them.

        Reads the zoom, sample density, center, size, pixels-per-unit and
        spatial oversample of ``cp``, the pixel aspect ratio of ``frame``
        and the gutter of ``filters``.  All two-component quantities are
        ``Point`` arrays and are combined element-wise.
        """
        # Calculate the conversion matrix between the IFS space (xform
        # coordinates) and the sampling lattice (bucket addresses)
        # TODO: test this code (against compute_camera?)
        zoom_scale = 2.0 ** cp.zoom
        self.sample_density = cp.sample_density * zoom_scale * zoom_scale

        view_center = Point(cp._center[0], cp._center[1])
        view_size = Point(cp.width, cp.height)

        # pix per unit, where 'unit' is '1.0' in IFS space
        scaled_ppu = cp.pixels_per_unit * zoom_scale
        self.ppu = Point(scaled_ppu / frame.pixel_aspect_ratio, scaled_ppu)

        # extra shifts applied due to gutter
        gutter_shift = filters.gutter / (cp.spatial_oversample * self.ppu)
        lower_left = view_center - (view_size / (2 * self.ppu))
        extent = view_size / self.ppu

        self.lower_bounds = lower_left - gutter_shift
        self.upper_bounds = lower_left + extent + gutter_shift

        # Scale/offset pair mapping [lower_bounds, upper_bounds] onto [0, 1].
        span = self.upper_bounds - self.lower_bounds
        self.norm_scale = 1.0 / span
        self.norm_offset = -self.norm_scale * self.lower_bounds

        # The same map multiplied through by the view size.
        self.idx_scale = view_size * self.norm_scale
        self.idx_offset = view_size * self.norm_offset
|
|
|