Rearrange the main render loop... again.

Using one stream with two pagelocked host buffers allows us to keep the
GPU work queue full without pegging the CPU, and also reduces the
incidences where a host buffer will get overwritten before it can be
written. devtid() was flaky, so this patch also introduces a ringbuffer
to handle the 'slots' concept. It also introduces an adaptive number of
temporal samples, which improves efficiency but also killed the
assumption that (ntemporal_samples % 256 == 0), which required some
additional fixes.
This commit is contained in:
Steven Robertson 2011-10-28 08:30:36 -04:00
parent 15f88383b1
commit 185823ad55
5 changed files with 127 additions and 113 deletions

View File

@ -227,8 +227,9 @@ class GenomePacker(HunkOCode):
__global__ __global__
void interp_{{tname}}({{tname}}* out, float *times, float *knots, void interp_{{tname}}({{tname}}* out, float *times, float *knots,
float tstart, float tstep, mwc_st *rctxes) { float tstart, float tstep, mwc_st *rctxes, int maxid) {
int id = gtid(); int id = gtid();
if (id >= maxid) return;
out = &out[id]; out = &out[id];
mwc_st rctx = rctxes[id]; mwc_st rctx = rctxes[id];
float time = tstart + id * tstep; float time = tstart + id * tstep;

View File

@ -128,6 +128,7 @@ class IterCode(HunkOCode):
// Note: for normalized lookups, uchar4 actually returns floats // Note: for normalized lookups, uchar4 actually returns floats
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex; texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
__shared__ iter_params params; __shared__ iter_params params;
__device__ int rb_head, rb_tail, rb_size;
""" """
@ -241,10 +242,15 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
def _iterbody(self): def _iterbody(self):
tmpl = Template(r''' tmpl = Template(r'''
__global__ void reset_rb(int size) {
rb_head = rb_tail = 0;
rb_size = size;
}
__global__ __global__
void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points, void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
const iter_params *all_params, int nsamps_to_generate) { const iter_params *all_params, int nsamps_to_generate) {
mwc_st rctx = msts[devtid()];
const iter_params *global_params = &(all_params[blockIdx.x]); const iter_params *global_params = &(all_params[blockIdx.x]);
__shared__ int nsamps; __shared__ int nsamps;
@ -259,6 +265,17 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
reinterpret_cast<float*>(&params)[i] = reinterpret_cast<float*>(&params)[i] =
reinterpret_cast<const float*>(global_params)[i]; reinterpret_cast<const float*>(global_params)[i];
__shared__ int rb_idx;
if (threadIdx.x == 1 && threadIdx.y == 1)
rb_idx = 32 * blockDim.y * (atomicAdd(&rb_head, 1) % rb_size);
__syncthreads();
int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
mwc_st rctx = msts[this_rb_idx];
float4 old_point = points[this_rb_idx];
float x = old_point.x, y = old_point.y,
color = old_point.z, fuse_rounds = old_point.w;
{{if info.chaos_used}} {{if info.chaos_used}}
int last_xf_used = 0; int last_xf_used = 0;
{{else}} {{else}}
@ -270,12 +287,9 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
// This is normally done after the swap-sync in the main loop // This is normally done after the swap-sync in the main loop
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}}) if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
cosel[threadIdx.x] = mwc_next_01(rctx); cosel[threadIdx.x] = mwc_next_01(rctx);
__syncthreads();
{{endif}} {{endif}}
__syncthreads();
float4 old_point = points[devtid()];
float x = old_point.x, y = old_point.y,
color = old_point.z, fuse_rounds = old_point.w;
while (1) { while (1) {
// This condition checks for large numbers, Infs, and NaNs. // This condition checks for large numbers, Infs, and NaNs.
@ -383,8 +397,20 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
__syncthreads(); __syncthreads();
if (nsamps <= 0) break; if (nsamps <= 0) break;
} }
points[devtid()] = make_float4(x, y, color, fuse_rounds);
msts[devtid()] = rctx; if (threadIdx.x == 0 && threadIdx.y == 0)
rb_idx = 32 * blockDim.y * (atomicAdd(&rb_tail, 1) % rb_size);
__syncthreads();
this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
points[this_rb_idx] = make_float4(x, y, color, fuse_rounds);
msts[this_rb_idx] = rctx;
return;
/*if (rctx.state == 0xffffffff && rctx.carry == 0xffffffff) {
printf("Warning: runaway sequence, multiplier %8x.\n", rctx.mul);
rctx.state = gtid();
}*/
} }
''') ''')
return tmpl.substitute( return tmpl.substitute(

View File

@ -80,28 +80,6 @@ uint32_t gtid() {
(blockIdx.x + (gridDim.x * blockIdx.y)))); (blockIdx.x + (gridDim.x * blockIdx.y))));
} }
/* Returns the ID of this thread on the device. Note that this counter is
* volatile according to the PTX ISA. It should be used for loading and saving
* state that must be unique across running threads, not for accessing things
* in a known order. */
__device__
int devtid() {
int result;
asm({{crep('''
{
.reg .u32 tmp1, tmp2;
mov.u32 %0, %smid;
mov.u32 tmp1, %nsmid;
mov.u32 tmp2, %warpid;
mad.lo.u32 %0, %0, tmp1, tmp2;
mov.u32 tmp1, %nwarpid;
mov.u32 tmp2, %laneid;
mad.lo.u32 %0, %0, tmp1, tmp2;
}''')}} : "=r"(result) );
return result;
}
__device__ __device__
uint32_t trunca(float f) { uint32_t trunca(float f) {
// truncate as used in address calculations. note the use of a signed // truncate as used in address calculations. note the use of a signed

View File

@ -4,6 +4,7 @@ import math
import re import re
import time as timemod import time as timemod
import tempfile import tempfile
from collections import namedtuple
from itertools import cycle, repeat, chain, izip from itertools import cycle, repeat, chain, izip
from ctypes import * from ctypes import *
from cStringIO import StringIO from cStringIO import StringIO
@ -14,7 +15,6 @@ from fr0stlib import pyflam3
from fr0stlib.pyflam3._flam3 import * from fr0stlib.pyflam3._flam3 import *
from fr0stlib.pyflam3.constants import * from fr0stlib.pyflam3.constants import *
import pycuda.autoinit
import pycuda.compiler import pycuda.compiler
import pycuda.driver as cuda import pycuda.driver as cuda
import pycuda.tools import pycuda.tools
@ -24,6 +24,8 @@ import cuburn.genome
from cuburn import affine from cuburn import affine
from cuburn.code import util, mwc, iter, filtering from cuburn.code import util, mwc, iter, filtering
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
class Renderer(object): class Renderer(object):
""" """
Control structure for rendering a series of frames. Control structure for rendering a series of frames.
@ -109,38 +111,62 @@ class Renderer(object):
def render(self, times): def render(self, times):
""" """
Render a flame for each genome in the iterable value 'genomes'. Render a flame for each genome in the iterable value 'genomes'.
Returns a Python generator object which will yield a 2-tuple of Returns a RenderedImage object with the rendered buffer in the
``(time, buf)``, where ``time`` is the start time of the frame and requested format (3D RGBA ndarray only for now).
``buf`` is a 3D (width, height, channel) NumPy array containing
[0,1]-valued RGBA components.
This method produces a considerable amount of side effects, and should This method produces a considerable amount of side effects, and should
not be used lightly. Things may go poorly for you if this method is not not be used lightly. Things may go poorly for you if this method is not
allowed to run until completion (by exhausting all items in the allowed to run until completion (by exhausting all items in the
generator object). generator object).
``times`` is a sequence of (start, stop) times defining the temporal ``times`` is a sequence of (idx, start, stop) times, where index is
range to be rendered for each frame. This will change to be more the logical frame number (though it can be any value) and 'start' and
frame-centric in the future, allowing for interpolated temporal width. 'stop' together define the time range to be rendered for each frame.
""" """
if times == []: if times == []:
return return
reset_rb_fun = self.mod.get_function("reset_rb")
packer_fun = self.mod.get_function("interp_iter_params")
palette_fun = self.mod.get_function("interp_palette_hsv")
iter_fun = self.mod.get_function("iter")
info = self.info info = self.info
iter_stream = cuda.Stream() stream = cuda.Stream()
filt_stream = cuda.Stream() event_a = cuda.Event().record(stream)
event_b = None
nbins = info.acc_height * info.acc_stride nbins = info.acc_height * info.acc_stride
d_accum = cuda.mem_alloc(16 * nbins) d_accum = cuda.mem_alloc(16 * nbins)
d_out = cuda.mem_alloc(16 * nbins) d_out = cuda.mem_alloc(16 * nbins)
num_sm = cuda.Context.get_device().multiprocessor_count # Calculate 'nslots', the number of simultaneous running threads that
cps_per_block = 1024 # can be active on the GPU during iteration (and thus the number of
# slots for loading and storing RNG and point context that will be
# prepared on the device), 'rb_size' (the number of blocks in
# 'nslots'), and determine a number of temporal samples
# likely to load-balance effectively
iter_threads_per_block = 256
dev_data = pycuda.tools.DeviceData()
occupancy = pycuda.tools.OccupancyRecord(
dev_data, iter_threads_per_block,
iter_fun.shared_size_bytes, iter_fun.num_regs)
nsms = cuda.Context.get_device().multiprocessor_count
rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32)
nslots = iter_threads_per_block * rb_size
ntemporal_samples = int(np.ceil(1000. / rb_size) * rb_size)
# Reset the ringbuffer info for the slots
reset_rb_fun(np.int32(rb_size), block=(1,1,1))
d_points = cuda.mem_alloc(nslots * 16)
seeds = mwc.MWC.make_seeds(nslots)
d_seeds = cuda.to_device(seeds)
genome_times, genome_knots = self._iter.packer.pack() genome_times, genome_knots = self._iter.packer.pack()
d_genome_times = cuda.to_device(genome_times) d_genome_times = cuda.to_device(genome_times)
d_genome_knots = cuda.to_device(genome_knots) d_genome_knots = cuda.to_device(genome_knots)
info_size = 4 * len(self._iter.packer) * cps_per_block info_size = 4 * len(self._iter.packer) * ntemporal_samples
d_infos = cuda.mem_alloc(info_size) d_infos = cuda.mem_alloc(info_size)
pals = info.genome.color.palette pals = info.genome.color.palette
@ -154,91 +180,62 @@ class Renderer(object):
np.concatenate(map(info.db.palettes.get, pals[1::2]))) np.concatenate(map(info.db.palettes.get, pals[1::2])))
d_palmem = cuda.mem_alloc(256 * info.palette_height * 4) d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
# The '+1' avoids more situations where the 'smid' value is larger pal_array_info = cuda.ArrayDescriptor()
# than the number of enabled SMs on a chip, which is warned against in pal_array_info.height = info.palette_height
# the docs but not seen in the wild. Things could get nastier on pal_array_info.width = 256
# subsequent silicon, but I doubt they'd ever kill more than 1 SM pal_array_info.array_format = cuda.array_format.UNSIGNED_INT8
nslots = pycuda.autoinit.device.max_threads_per_multiprocessor * \ pal_array_info.num_channels = 4
(pycuda.autoinit.device.multiprocessor_count + 1)
d_points = cuda.mem_alloc(nslots * 16) h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
seeds = mwc.MWC.make_seeds(nslots) np.float32)
d_seeds = cuda.to_device(seeds) h_out_b = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
np.float32)
last_idx = None
h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4), for idx, start, stop in times:
np.float32)
filter_done_event = None
packer_fun = self.mod.get_function("interp_iter_params")
palette_fun = self.mod.get_function("interp_palette_hsv")
iter_fun = self.mod.get_function("iter")
#iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
last_time = times[0][0]
for start, stop in times:
cen_cp = cuburn.genome.HacketyGenome(info.genome, (start+stop)/2) cen_cp = cuburn.genome.HacketyGenome(info.genome, (start+stop)/2)
if filter_done_event:
iter_stream.wait_for_event(filter_done_event)
width = np.float32((stop-start) / info.palette_height) width = np.float32((stop-start) / info.palette_height)
palette_fun(d_palmem, d_palint_times, d_palint_vals, palette_fun(d_palmem, d_palint_times, d_palint_vals,
np.float32(start), width, np.float32(start), width,
block=(256,1,1), grid=(info.palette_height,1), block=(256,1,1), grid=(info.palette_height,1),
stream=iter_stream) stream=stream)
# TODO: do we need to do this each time in order to reset cache?
tref = self.mod.get_texref('palTex') tref = self.mod.get_texref('palTex')
array_info = cuda.ArrayDescriptor() tref.set_address_2d(d_palmem, pal_array_info, 1024)
array_info.height = info.palette_height
array_info.width = 256
array_info.array_format = cuda.array_format.UNSIGNED_INT8
array_info.num_channels = 4
tref.set_address_2d(d_palmem, array_info, 1024)
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4) tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
tref.set_filter_mode(cuda.filter_mode.LINEAR) tref.set_filter_mode(cuda.filter_mode.LINEAR)
width = np.float32((stop-start) / cps_per_block) width = np.float32((stop-start) / ntemporal_samples)
packer_fun(d_infos, d_genome_times, d_genome_knots, packer_fun(d_infos, d_genome_times, d_genome_knots,
np.float32(start), width, d_seeds, np.float32(start), width, d_seeds,
block=(256,1,1), grid=(cps_per_block/256,1), np.int32(ntemporal_samples), block=(256,1,1),
stream=iter_stream) grid=(int(np.ceil(ntemporal_samples/256.)),1),
stream=stream)
# TODO: if we only do this once per anim, does quality improve? # TODO: if we only do this once per anim, does quality improve?
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots, util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
iter_stream, np.float32(np.nan)) stream, np.float32(np.nan))
# Get interpolated control points for debugging # Get interpolated control points for debugging
#iter_stream.synchronize() #stream.synchronize()
#d_temp = cuda.from_device(d_infos, #d_temp = cuda.from_device(d_infos,
#(cps_per_block, len(self._iter.packer)), np.float32) #(ntemporal_samples, len(self._iter.packer)), np.float32)
#for i, n in zip(d_temp[5], self._iter.packer.packed): #for i, n in zip(d_temp[5], self._iter.packer.packed):
#print '%60s %g' % ('_'.join(n), i) #print '%60s %g' % ('_'.join(n), i)
nsamps = info.density * info.width * info.height / cps_per_block util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, stream)
nsamps = info.density * info.width * info.height / ntemporal_samples
iter_fun(np.uint64(d_accum), d_seeds, d_points, iter_fun(np.uint64(d_accum), d_seeds, d_points,
d_infos, np.int32(nsamps), d_infos, np.int32(nsamps),
block=(32, self._iter.NTHREADS/32, 1), block=(32, self._iter.NTHREADS/32, 1),
grid=(cps_per_block, 1), grid=(ntemporal_samples, 1),
texrefs=[tref], stream=iter_stream) texrefs=[tref], stream=stream)
iter_stream.synchronize() util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, stream)
if filter_done_event: self._de.invoke(self.mod, cen_cp, d_accum, d_out, stream)
while not filt_stream.is_done():
timemod.sleep(0.01)
filt_stream.synchronize()
yield last_time, self._trim(h_out)
last_time = start
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
self._de.invoke(self.mod, cen_cp, d_accum, d_out, filt_stream)
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
filter_done_event = cuda.Event().record(filt_stream)
f32 = np.float32 f32 = np.float32
# TODO: implement integration over cubic splines? # TODO: implement integration over cubic splines?
@ -255,12 +252,24 @@ class Renderer(object):
color_fun = self.mod.get_function("colorclip") color_fun = self.mod.get_function("colorclip")
blocks = int(np.ceil(np.sqrt(nbins / 256))) blocks = int(np.ceil(np.sqrt(nbins / 256)))
color_fun(d_out, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins), color_fun(d_out, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins),
block=(256, 1, 1), grid=(blocks, blocks), block=(256, 1, 1), grid=(blocks, blocks), stream=stream)
stream=filt_stream) cuda.memcpy_dtoh_async(h_out_a, d_out, stream)
cuda.memcpy_dtoh_async(h_out, d_out, filt_stream)
filt_stream.synchronize() if event_b:
yield start, self._trim(h_out) while not event_a.query():
timemod.sleep(0.01)
gpu_time = event_a.time_since(event_b)
yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
event_a, event_b = cuda.Event().record(stream), event_a
h_out_a, h_out_b = h_out_b, h_out_a
last_idx = idx
while not event_a.query():
timemod.sleep(0.001)
gpu_time = event_a.time_since(event_b)
yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
def _trim(self, result): def _trim(self, result):
g = self.info.gutter g = self.info.gutter

View File

@ -27,12 +27,13 @@ np.set_printoptions(precision=5, edgeitems=20)
real_stdout = sys.stdout real_stdout = sys.stdout
def save(time, raw, pfx): def save(rimg, pfx):
noalpha = raw[:,:,:3] noalpha = rimg.buf[:,:,:3]
name = pfx + '%05d' % time name = pfx + str(rimg.idx)
img = scipy.misc.toimage(noalpha, cmin=0, cmax=1) img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
img.save(name+'.png') img.save(name+'.png')
print name print name, rimg.gpu_time
sys.stdout.flush()
def main(jobfilepath, outprefix): def main(jobfilepath, outprefix):
# This includes the genomes and other cruft, a dedicated reader will be # This includes the genomes and other cruft, a dedicated reader will be
@ -40,16 +41,15 @@ def main(jobfilepath, outprefix):
info = cuburn.genome.load_info(open(jobfilepath).read()) info = cuburn.genome.load_info(open(jobfilepath).read())
times = np.linspace(0, 1, info.duration * info.fps + 1) times = np.linspace(0, 1, info.duration * info.fps + 1)
#rtimes = zip(['%05d' % i for i in range(len(times))[1:]], times, times[1:])
# One still, one motion-blurred for testing rtimes = [('still', times[0], times[0]), ('motion', times[1], times[2])]
rtimes = [(times[0], times[0]), (times[1], times[2])]
renderer = cuburn.render.Renderer(info) renderer = cuburn.render.Renderer(info)
renderer.compile() renderer.compile()
renderer.load() renderer.load()
for idx, (ftime, out) in enumerate(renderer.render(rtimes)): for out in renderer.render(rtimes):
save(idx, out, outprefix) save(out, outprefix)
if __name__ == "__main__": if __name__ == "__main__":
main(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else 'out/') main(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else 'out/')