	Finally runs again
@@ -15,7 +1,8 @@ from cuburnlib.ptx import PTXModule
class LaunchContext(object):
    """
    Context collecting the information needed to create, run, and gather the
    results of a device computation.
    results of a device computation. This may eventually also include an actual
    CUDA context, but for now it just uses the global one.

    To create the fastest device code across multiple device families, this
    context may decide to iteratively refine the final PTX by regenerating
@@ -32,34 +33,27 @@ class LaunchContext(object):
        `mod`:      Final compiled module. Unavailable during assembly.

    """
    def __init__(self, entries, block=(1,1,1), grid=(1,1), seed=None,
                 tests=False):
    def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
        self.entry_types = entries
        self.block, self.grid, self.build_tests = block, grid, tests
        self.rand = np.random.mtrand.RandomState(seed)
        self.setup_done = False

    @property
    def threads(self):
        return reduce(lambda a, b: a*b, self.block + self.grid)
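Editor's sketch, not part of this commit: `threads` is just the product of every block and grid dimension. Using the launch shape that Animation.compile passes later in this diff, block=(256,1,1) and grid=(54,1):

    from functools import reduce  # stdlib import on Python 3; a builtin on Python 2
    block, grid = (256, 1, 1), (54, 1)
    print(reduce(lambda a, b: a * b, block + grid))  # 13824 threads total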

    def print_source(self):
        print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in
                        enumerate(self.ptx.source.split('\n'))])

    def compile(self, to_inject={}, verbose=False):
        inj = dict(to_inject)
        inj['ctx'] = self
        self.ptx = PTXModule(self.entry_types, inj, self.build_tests)
    def compile(self, verbose=False, **kwargs):
        kwargs['ctx'] = self
        self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
        try:
            self.mod = cuda.module_from_buffer(self.ptx.source)
        except (cuda.CompileError, cuda.RuntimeError), e:
            print "Aww, dang, compile error. Here's the source:"
            self.print_source()
            self.ptx.print_source()
            raise e
        if verbose:
            if verbose >= 3:
                self.print_source()
                self.ptx.print_source()
            for entry in self.ptx.entries:
                func = self.mod.get_function(entry.entry_name)
                print "Compiled %s: used %d regs, %d sm, %d local" % (

@@ -4,126 +4,34 @@ Contains the PTX fragments which will drive the device.

import os
import time
import struct

import pycuda.driver as cuda
import numpy as np

from cuburnlib.ptx import *

"""
Here's the current draft of the full algorithm implementation.

declare xform jump table

load random state

clear x_coord, y_coord, z_coord, w_coord;
store -(FUSE+1) to shared (per-warp) num_samples_sh
clear badvals [1]

load param (global_cp_idx_addr)
index table start (global_cp_idx) [2]
load count of indexes from global cp index =>
    store to qlocal current_cp_num [3]

outermost loop start:
    load current_cp_num
    if current_cp_num <= 0:
        exit

    load param global_cp_idx_addr
    calculate offset into address with current_cp_num, global_cp_idx_addr
    load cp_base_address
    stream_start (cp_base, cp_base_addr) [4]

FUSE_START:
    num_samples += 1
    if num_samples >= 0:
        # Okay, we're done FUSEing, prepare to enter normal loop
        load num_samples => store to shared (per-warp) num_samples


ITER_LOOP_START:
        reg xform_addr, xform_stream_addr, xform_select

        mwc_next_u32 to xform_select
        # Performance test: roll/unroll this loop?
        stream_load xform_prob (cp_stream)
        if xform_select <= xform_prob:
            bra.uni XFORM_1_LBL
        ...
        stream_load xform_prob (cp_stream)
        if xform_select <= xform_prob:
            bra.uni XFORM_N_LBL

XFORM_1_LBL:
        stream_load xform_1_ (cp_stream)
        ...
        bra.uni XFORM_POST

XFORM_POST:
        [if final_xform:]
            [do final_xform]

        if num_samples < 0:
            # FUSE still in progress
            bra.uni FUSE_START

FRAGMENT_WRITEBACK:
        # Unknown at this time.

SHUFFLE:
        # Unknown at this time.

        load num_samples from num_samples_sh
        num_samples -= 1
        if num_samples > 0:
            bra.uni ITER_LOOP_START


[1] Tracking 'badvals' can put a pretty large hit on performance, particularly
    for images that sample a small amount of the grid. So this might be cut
    when rendering for performance. On the other hand, it might actually help
    tune the algorithm later, so it'll definitely be an option.

[2] Control points for each temporal sample will be preloaded to the
    device in the compact DataStream format (more on this later). Their
    locations are represented in an index table, which starts with a single
    `.u32 length`, followed by `length` pointers. To avoid having to keep
    reloading `length`, or worse, using a register to hold it in memory, we
    instead count *down* to zero. This is a very common idiom.

[3] 'qlocal' is quasi-local storage. It could easily be actual local storage,
    depending on how local storage is implemented, but the extra 128-byte loads
    for such values might make a performance difference. qlocal variables may
    be identical across a warp or even a CTA, and so variables noted as
    "qlocal" here might end up in shared memory or even a small per-warp or
    per-CTA buffer in global memory created specifically for this purpose,
    after benchmarking is done.

[4] DataStreams are "opaque" data serialization structures defined below. The
    structure of a stream is actually created while parsing the DSL by the load
    statements themselves. Some benchmarks need to be done before DataStreams
    stop being "opaque" and become simply "dynamic".
"""
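Editor's sketch of the index-table layout from note [2], not part of this commit: a single u32 count followed by that many pointers, consumed by counting down to zero. Packing the pointers as 32-bit values here is an assumption made only for illustration.

    import struct

    def pack_cp_index(pointers):
        # One u32 count, then `count` u32 device addresses, little-endian.
        return struct.pack('<I%dI' % len(pointers), len(pointers), *pointers)

    table = pack_cp_index([0x10000, 0x10400, 0x10800])
    count, = struct.unpack_from('<I', table)
    while count > 0:        # the count-down-to-zero idiom from note [2]
        count -= 1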

class IterThread(PTXTest):
    entry_name = 'iter_thread'
    entry_params = []

    def __init__(self):
        self.cps_uploaded = False

    def deps(self):
        return [MWCRNG, CPDataStream]

    @ptx_func
    def module_setup(self):
        mem.global_.u32('g_cp_array',
                [features.max_ntemporal_samples,'*',cp_stream_size])
                        cp_stream_size*features.max_ntemporal_samples)
        mem.global_.u32('g_num_cps')
        # TODO move into debug statement
        mem.global_.u32('g_num_rounds', ctx.threads)
        mem.global_.u32('g_num_writes', ctx.threads)

    @ptx_func
    def entry():
    def entry(self):
        reg.f32('x_coord y_coord color_coord alpha_coord')

        # TODO: temporary, for testing
@@ -158,8 +66,8 @@ class IterThread(PTXTest):
        op.mov.s32(num_samples, -(features.num_fuse_samples+1))

        # TODO: Move cp_num to qlocal storage (or spill it, rarely accessed)
        reg.u32('cp_num cpA')
        mov.u32(cp_num, 0)
        reg.u32('cp_idx cpA')
        op.mov.u32(cp_idx, 0)

        label('cp_loop_start')
        op.bar.sync(0)
@@ -168,19 +76,19 @@ class IterThread(PTXTest):
            reg.u32('num_cps')
            reg.pred('p_last_cp')
            op.ldu.u32(num_cps, addr(g_num_cps))
            op.setp.lt.u32(p_last_cp, cp_num, num_cps)
            op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
            op.bra.uni('all_cps_done', ifp=p_last_cp)

        with block('Load CP address'):
            op.mov.u32(cpA, g_cp_array)
            op.mad.lo.u32(cpA, cp_num, cp_stream_size, cpA)
            op.mad.lo.u32(cpA, cp_idx, cp_stream_size, cpA)

        with block('Increment CP number, load num_samples (unless in fuse)'):
            reg.pred('p_in_fuse')
            op.setp.lt.s32(p_in_fuse, num_samples, 0)
            op.add.u32(cp_num, cp_num, 1, ifp=p_in_fuse)
            cp_stream_get(cpA, num_samples, 'cp.samples_per_thread',
                          ifp=p_in_fuse)
        with block('Increment CP index, load num_samples (unless in fuse)'):
            reg.pred('p_not_in_fuse')
            op.setp.ge.s32(p_not_in_fuse, num_samples, 0)
            op.add.u32(cp_idx, cp_idx, 1, ifp=p_not_in_fuse)
            cp_stream_get(cpA, num_samples, 'samples_per_thread',
                          ifp=p_not_in_fuse)

        label('fuse_loop_start')
        with block('FUSE-specific stuff'):
@@ -188,7 +96,7 @@ class IterThread(PTXTest):
            comment('If num_samples == -1, set it to 0 and jump back up')
            comment('This will start the normal CP loading machinery')
            op.setp.eq.s32(p_fuse, num_samples, -1)
            op.mov.s32(p_fuse, 0, ifp=p_fuse)
            op.mov.s32(num_samples, 0, ifp=p_fuse)
            op.bra.uni(cp_loop_start, ifp=p_fuse)

            comment('If num_samples < -1, still fusing, so increment')
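Editor's sketch, not the device logic verbatim: the single num_samples counter above does double duty. It starts at -(num_fuse_samples+1), counts up toward -1 while fusing, flips to the real per-CP sample budget once fusing ends, then counts down to zero, one iteration at a time. A simplified pure-Python walk-through:

    num_fuse_samples = 25         # Features.num_fuse_samples, set in render.py
    samples_per_thread = 3        # tiny budget so the trace stays short
    num_samples = -(num_fuse_samples + 1)
    fuse_rounds = 0
    while True:
        if num_samples == -1:             # fusing just finished: load the budget
            num_samples = samples_per_thread
        elif num_samples < -1:            # still fusing: count up, write nothing
            num_samples += 1
            fuse_rounds += 1
            continue
        num_samples -= 1                  # one visible iteration happened
        if num_samples == 0:
            break
    print(fuse_rounds)                    # 25 fuse rounds before any writes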
@@ -204,33 +112,55 @@ class IterThread(PTXTest):
        with block("Test if we're still in FUSE"):
            reg.pred('p_in_fuse')
            op.setp.lt.s32(p_in_fuse, num_samples, 0)
            op.bra.uni(fuse_start, ifp=p_in_fuse)
            op.bra.uni(fuse_loop_start, ifp=p_in_fuse)

        with block("Ordinarily, we'd write the result here"):
            op.add.u32(num_writes, num_writes, 1)

        with block("Check to see if we're done with this CP"):
            reg.pred('p_cp_done')
            op.add.s32(num_samples, num_samples, -1)
            op.setp.eq.s32(p_cp_done, num_samples, 0)
            op.bra.uni(cp_loop_start, ifp=p_cp_done)

        op.bra.uni(iter_loop_start)


        label('all_cps_done')
        # TODO this is for testing, move it to a debug statement
        store_per_thread(g_num_rounds, num_rounds)
        store_per_thread(g_num_writes, num_writes)

    def call(self, ctx):
        raise HorribleDeathError("Okay I'm going to bed now")
    def upload_cp_stream(self, ctx, cp_stream, num_cps):
        cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
        assert len(cp_stream) <= cp_array_l, "Stream too big!"
        cuda.memcpy_htod_async(cp_array_dp, cp_stream)
        num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
        cuda.memcpy_htod_async(num_cps_dp, struct.pack('i', num_cps))
        self.cps_uploaded = True

    def call(self, ctx):
        if not self.cps_uploaded:
            raise Error("Cannot call IterThread before uploading CPs")
        func = ctx.mod.get_function('iter_thread')
        dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)

        num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
        num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
        rounds = cuda.from_device(num_rounds_dp, ctx.threads, np.uint32)
        writes = cuda.from_device(num_writes_dp, ctx.threads, np.uint32)
        print "Rounds:", rounds
        print "Writes:", writes

class MWCRNG(PTXFragment):
    def __init__(self):
        self.rand = np.random
        self.threads_ready = 0
        if not os.path.isfile('primes.bin'):
            raise EnvironmentError('primes.bin not found')

    def set_seed(self, seed):
        self.rand = np.random.mtrand.RandomState(seed)

    @ptx_func
    def module_setup(self):
        mem.global_.u32('mwc_rng_mults', ctx.threads)
@@ -284,13 +214,13 @@ class MWCRNG(PTXFragment):
        # Randomness in choosing multipliers is good, but larger multipliers
        # have longer periods, which is also good. This is a compromise.
        mults = np.array(mults[:ctx.threads*4])
        ctx.rand.shuffle(mults)
        self.rand.shuffle(mults)
        # Copy multipliers and seeds to the device
        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
        cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
        # Intentionally excludes both 0 and (2^32-1), as they can lead to
        # degenerate sequences of period 0
        states = np.array(ctx.rand.randint(1, 0xffffffff, size=2*ctx.threads),
        states = np.array(self.rand.randint(1, 0xffffffff, size=2*ctx.threads),
                          dtype=np.uint32)
        statedp, statel = ctx.mod.get_global('mwc_rng_state')
        cuda.memcpy_htod_async(statedp, states.tostring())
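Editor's sketch for reference, not part of this commit: the multipliers and seeds uploaded above drive a 32-bit multiply-with-carry generator. This is the standard MWC step, shown host-side, and is assumed (not confirmed by this diff) to match the update emitted in the PTX fragment; the multiplier value below is purely illustrative.

    def mwc_next(x, carry, mult):
        t = mult * x + carry
        return t & 0xffffffff, t >> 32     # (new state word, new carry)

    x, carry = 0x12345678, 0x9abcdef0      # stand-ins for one thread's state
    x, carry = mwc_next(x, carry, 4164903690)   # illustrative multiplier only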
@@ -376,7 +306,7 @@ class MWCRNGTest(PTXTest):
class CameraCoordTransform(PTXFragment):
    pass

class CPDataStream(PTXFragment):
class CPDataStream(DataStream):
    """DataStream which stores the control points."""
    prefix = 'cp'


cuburnlib/ptx.py  125
@@ -11,7 +11,7 @@ easier to maintain using this system.
# If you see 'import inspect', you know you're in for a good time
import inspect
import types
import traceback
import struct
from cStringIO import StringIO
from collections import namedtuple

@@ -116,6 +116,8 @@ class _BlockInjector(object):
        self.dead = False
        map(self.inject, self.to_inject.items())
    def __exit__(self, exc_type, exc_val, tb):
        # Do some real exceptorin'
        if exc_type is not None: return
        for k in self.injected:
            del self.inject_into[k]
        self.dead = True
@@ -137,17 +139,27 @@ class _Block(object):
        inj = self.stack[-1].injectors
        [inj.remove(i) for i in inj if i.dead]
    def push_ctx(self):
        # Move most recent active injector to new context
        self.clean_injectors()
        last_inj = self.stack[-1].injectors.pop()
        self.stack.append(BlockCtx(dict(self.stack[-1].locals), [],
                          [last_inj]))
        self.stack.append(BlockCtx(dict(self.stack[-1].locals), [], []))
        # The only reason we should have no injectors in the previous block is
        # if we are hitting a new ptx_func entry point or global declaration at
        # PTX module scope, which means the stack only contains the outer
        # context and the current one (i.e. len(stack) == 2)
        if len(self.stack[-2].injectors) == 0:
            assert len(self.stack) == 2, "Empty injector list too early!"
        # Otherwise, the active injector in the previous block is the one for
        # the Python function which is currently creating a new PTX block,
        # and it needs to be promoted to the current block
        else:
            self.stack[-1].injectors.append(self.stack[-2].injectors.pop())
    def pop_ctx(self):
        self.clean_injectors()
        bs = self.stack.pop()
        # TODO: figure out why this next line is needed
        [bs.injectors.remove(i) for i in bs.injectors if i.dead]
        self.stack[-1].code.extend(bs.code)
        if len(self.stack) == 1:
            # We're on outer_ctx, so all injectors should be gone
            # We're on outer_ctx, so all injectors should be gone.
            assert len(bs.injectors) == 0, "Injector/context mismatch"
            return
        # The only injector should be the one added in push_ctx
@@ -186,7 +198,7 @@ class _Block(object):
        spacing. To keep things simple, nested lists and tuples will be reduced
        in this manner (but not other iterable types). Coercion will not happen
        until after the entire DSL call tree has been walked. This allows a
        class to submit a mutable type (e.g. the trivial `StrVar`) when first
        class to submit a mutable type (e.g. ``DelayVar``) when first
        walked with an undefined value, then substitute the correct value on
        being finalized.

@@ -196,14 +208,23 @@ class _Block(object):
        """
        self.stack[-1].code.append(PTXStmt(prefix, op, vars, semi, indent))

class StrVar(object):
class DelayVar(object):
    """
    Trivial wrapper to allow deferred variable substitution.
    """
    def __init__(self, val=None):
        self.val = val
    def __str__(self):
        return str(val)
        return str(self.val)
    def __mul__(self, other):
        # Oh this is truly egregious
        return DelayVarProxy(self, "self.other.val*" + str(other))

class DelayVarProxy(object):
    def __init__(self, other, expr):
        self.other, self.expr = other, expr
    def __str__(self):
        return str(eval(self.expr))

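Editor's usage sketch, not part of this commit: DelayVar lets a value such as the final stream size be referenced while code is being built and filled in later, because string coercion only happens after the whole DSL tree has been walked.

    class DelayVar(object):
        def __init__(self, val=None):
            self.val = val
        def __str__(self):
            return str(self.val)

    size = DelayVar("not_yet_determined")
    stmt = ['mov.u32 stride, ', size, ';']     # emitted while size is unknown
    size.val = 96                              # finalize_code fills it in later
    print(''.join(map(str, stmt)))             # mov.u32 stride, 96;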
class _PTXFuncWrapper(object):
    """Enables ptx_func"""
@@ -298,6 +319,9 @@ class Block(object):
            self.block.code(op=['// ', self.comment], semi=False)
        self.comment = None
    def __exit__(self, exc_type, exc_value, tb):
        # Allow exceptions to be propagated; things get really messy if we try
        # to pop the stack if things aren't ordered correctly
        if exc_type is not None: return
        self.block.code(indent=-1)
        self.block.code(op='}', semi=False)
        self.block.pop_ctx()
@@ -370,12 +394,14 @@ class Op(_CallChain):
    """
    def _call(self, op, *args, **kwargs):
        pred = ''
        if 'ifp' in kwargs:
            if 'ifnotp' in kwargs:
        ifp = kwargs.get('ifp')
        ifnotp = kwargs.get('ifnotp')
        if ifp:
            if ifnotp:
                raise SyntaxError("can't use both, fool")
            pred = ['@', kwargs['ifp']]
        if 'ifnotp' in kwargs:
            pred = ['@!', kwargs['ifnotp']]
            pred = ['@', ifp]
        if ifnotp:
            pred = ['@!', ifnotp]
        self.block.code(pred, '.'.join(op), _softjoin(args, ','))

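Editor's sketch, not part of this commit: the ifp/ifnotp handling above turns a predicate register into the '@pred' or '@!pred' guard that prefixes the emitted PTX instruction. A stripped-down rendering of just that logic, using string names in place of register objects:

    def pred_prefix(ifp=None, ifnotp=None):
        if ifp and ifnotp:
            raise SyntaxError("can't use both, fool")
        if ifp:
            return '@' + ifp
        if ifnotp:
            return '@!' + ifnotp
        return ''

    print(pred_prefix(ifp='p_last_cp'))      # @p_last_cp
    print(pred_prefix(ifnotp='p_fuse'))      # @!p_fuse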
class Mem(object):
@@ -421,7 +447,7 @@ class Mem(object):
        >>> op.st.global.v2.u32(addr(areg), vec(reg1, reg2))
        >>> op.ld.global.v2.u32(vec(reg1, reg2), addr(areg, 8))
        """
        return ['[', areg, aoffset and '+' or '', aoffset, ']']
        return ['[', areg, aoffset is not '' and '+' or '', aoffset, ']']

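Editor's sketch, not part of this commit: the addr() fragment above flattens to a bracketed PTX address operand, with the '+offset' piece dropped when no offset is given. Restating the module's _flatten helper locally (and writing the offset check with != for clarity):

    def _flatten(val):
        if isinstance(val, (list, tuple)):
            return ''.join(map(_flatten, val))
        return str(val)

    def addr(areg, aoffset=''):
        return ['[', areg, '+' if aoffset != '' else '', aoffset, ']']

    print(_flatten(addr('areg')))        # [areg]
    print(_flatten(addr('areg', 8)))     # [areg+8]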
class _MemFactory(_CallChain):
    """Actual `mem` object"""
@@ -538,8 +564,8 @@ class PTXFragment(object):
        """
        Called after running all PTX DSL functions, but before code generation,
        to allow fragments which postponed variable evaluation (e.g. using
        `StrVar`) to fill in the resulting values. Most fragments should not
        use this.
        ``DelayVar``) to fill in the resulting values. Most fragments should
        not use this.

        If implemented, this function *may* use an @ptx_func decorator to
        access the global DSL scope, but pretty please don't emit any code
@@ -796,6 +822,13 @@ class PTXModule(object):
                raise ValueError("Too many recompiles scheduled!")
            self.__needs_recompilation = True

    def print_source(self):
        if not hasattr(self, 'source'):
            raise ValueError("Not assembled yet!")
        print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in
                        enumerate(self.source.split('\n'))])


def _flatten(val):
    if isinstance(val, (list, tuple)):
        return ''.join(map(_flatten, val))
@@ -806,7 +839,7 @@ class PTXFormatter(object):
    Formats PTXStmt items into beautiful code. Well, the beautiful part is
    postponed for now.
    """
    def __init__(self, indent_amt=2, oplen_max=20, varlen_max=12):
    def __init__(self, indent_amt=4, oplen_max=20, varlen_max=12):
        self.idamt, self.opm, self.vm = indent_amt, oplen_max, varlen_max
    def format(self, code):
        out = []
@@ -844,7 +877,7 @@ class PTXFormatter(object):
_TExp = namedtuple('_TExp', 'type exprlist')
_DataCell = namedtuple('_DataCell', 'offset size texp')

class DataStream(object):
class DataStream(PTXFragment):
    """
    Simple interface between Python and PTX, designed to create and tightly
    pack control structs.
@@ -914,19 +947,19 @@ class DataStream(object):
        self.cells = []
        self.stream_size = 0
        self.free = {}
        self.size_strvar = StrVar("not_yet_determined")
        self.size_delayvars = []

    _types = dict(s8='b', u8='B', s16='h', u16='H', s32='i', u32='I', f32='f',
                  s64='l', u64='L', f64='d')
    def _get_type(self, *regs):
    def _get_type(self, regs):
        size = int(regs[0].type[1:])
        for r in regs:
        for reg in regs:
            if reg.type not in self._types:
                raise TypeError("Register %s of type %s not supported" %
                                (reg.name, reg.type))
            if int(r.type[1:]) != size:
            if int(reg.type[1:]) != size:
                raise TypeError("Can't vector-load different size regs")
        return size, ''.join([self._types.get(r.type) for r in regs])
        return size/8, ''.join([self._types.get(r.type) for r in regs])

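Editor's sketch, not part of this commit: _get_type maps PTX register types onto struct format characters and derives the element size from the bit width (the new version returns bytes, i.e. size/8). A host-side illustration; the Reg namedtuple is a stand-in that only needs name and type fields:

    import struct
    from collections import namedtuple

    Reg = namedtuple('Reg', 'name type')
    _types = dict(s8='b', u8='B', s16='h', u16='H', s32='i', u32='I', f32='f',
                  s64='l', u64='L', f64='d')

    regs = [Reg('x_coord', 'f32'), Reg('y_coord', 'f32')]
    size = int(regs[0].type[1:]) // 8                    # 4 bytes per f32
    fmt = ''.join(_types[r.type] for r in regs)          # 'ff'
    print('%d %s %d' % (size, fmt, struct.calcsize(fmt)))   # 4 ff 8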
    def _alloc(self, vsize, texp):
        # A really crappy allocator. May later include optimizations for
@@ -939,7 +972,7 @@ class DataStream(object):
        if idx is None:
            # No aligned free cells, allocate a new `align`-byte free cell
            assert alloc not in self.free
            self.free[alloc] = idx = len(self.stream_size)
            self.free[alloc] = idx = len(self.cells)
            self.cells.append(_DataCell(self.stream_size, alloc, None))
            self.stream_size += alloc
        # Overwrite the free cell at `idx` with texp
@@ -958,27 +991,28 @@ class DataStream(object):
                self.cells.insert(fidx, _DataCell(foffset, fsize, None))
                foffset += fsize
                self.free[fsize] = fidx
            fsize *= 2
        # Adjust indexes. This is ugly, but evidently unavoidable
        if fidx-idx:
            for k, v in filter(lambda k, v: v > idx, self.free.items()):
            for k, v in filter(lambda (k, v): v > idx, self.free.items()):
                self.free[k] = v+(fidx-idx)
        return self.offset
        return offset

    @ptx_func
    def _stream_get_internal(self, areg, dregs, exprs, ifp, ifnotp):
        size, type = self._get_type(dregs)
        vsize = size * len(dregs)
        texp = _TExp(type, [expr])
        if texp in self.expr_map:
        texp = _TExp(type, tuple(exprs))
        if texp in self.texp_map:
            offset = self.texp_map[texp]
        else:
            offset = self._alloc(vsize, texp)
            self.texp_map[texp] = offset
        vtype = {1: '', 2: '.v2', 4: '.v4'}.get(len(dregs))
        if len(dregs) > 0:
        opname = ['ldu', 'b%d' % (size*8)]
        if len(dregs) > 1:
            opname.insert(1, 'v%d' % len(dregs))
            dregs = vec(dregs)
        op._call('ldu%s.b%d' % (vtype, size), dregs, addr(areg+off),
                 ifp=ifp, ifnotp=ifnotp)
        op._call(opname, dregs, addr(areg, offset), ifp=ifp, ifnotp=ifnotp)

    @ptx_func
    def _stream_get(self, areg, dreg, expr, ifp=None, ifnotp=None):
@@ -991,16 +1025,20 @@ class DataStream(object):
                                  ifp, ifnotp)

    @ptx_func
    def _stream_get_v2(self, areg, d1, e1, d2, e2, d3, e3, d4, e4,
    def _stream_get_v4(self, areg, d1, e1, d2, e2, d3, e3, d4, e4,
                       ifp=None, ifnotp=None):
        self._stream_get_internal(areg, [d1, d2, d3, d4], [e1, e2, e3, e4],
                                  ifp, ifnotp)

    @property
    def _stream_size(self):
        return self.size_strvar
        x = DelayVar("not_yet_determined")
        self.size_delayvars.append(x)
        return x

    def finalize_code(self):
        self.size_strvar.val = str(self.stream_size)
        for dv in self.size_delayvars:
            dv.val = self.stream_size

    def to_inject(self):
        return {self.prefix + '_stream_get': self._stream_get,
@@ -1039,9 +1077,20 @@ class DataStream(object):
        for offset, size, texp in self.cells:
            if texp:
                type = texp.type
                vals = [eval(e, globals(), kwargs) for e in texp.expr_list]
                vals = [eval(e, globals(), kwargs) for e in texp.exprlist]
            else:
                type = 'x'*size # Padding bytes
                vals = []
            out.write(struct.pack(type, *vals))
            outfile.write(struct.pack(type, *vals))

    def print_record(self):
        for cell in self.cells:
            if cell.texp is None:
                print '%3d %2d --' % (cell.offset, cell.size)
                continue
            print '%3d %2d %4s %s' % (cell.offset, cell.size, cell.texp.type,
                                      cell.texp.exprlist[0])
            for exp in cell.texp.exprlist[1:]:
                print '%12s %s' % ('', exp)



@@ -1,12 +1,62 @@

from ctypes import *
from cStringIO import StringIO
import numpy as np
from fr0stlib.pyflam3 import Genome, Frame

from fr0stlib import pyflam3
from fr0stlib.pyflam3._flam3 import *
from fr0stlib.pyflam3.constants import *

from cuburnlib.cuda import LaunchContext
from cuburnlib.device_code import IterThread, CPDataStream

Point = lambda x, y: np.array([x, y], dtype=np.double)

class Genome(pyflam3.Genome):
    pass

class Frame(pyflam3.Frame):
    def interpolate(self, time, cp):
        flam3_interpolate(self.genomes, self.ngenomes, time, 0, byref(cp))

    def pack_stream(self, ctx, time):
        """
        Pack and return the control point data stream to render this frame.
        """
        # Get the central control point, and calculate parameters that change
        # once per frame
        cp = BaseGenome()
        self.interpolate(time, cp)
        self.filt = Filters(self, cp)
        rw = cp.spatial_oversample * cp.width  + 2 * self.filt.gutter
        rh = cp.spatial_oversample * cp.height + 2 * self.filt.gutter

        # Interpolate each time step, calculate per-step variables, and pack
        # into the stream
        cp_streamer = ctx.ptx.instances[CPDataStream]
        stream = StringIO()
        print "Data stream contents:"
        cp_streamer.print_record()
        tcp = BaseGenome()
        for batch_idx in range(cp.nbatches):
            for time_idx in range(cp.ntemporal_samples):
                idx = time_idx + batch_idx * cp.nbatches
                cp_time = time + self.filt.temporal_deltas[idx]
                self.interpolate(time, tcp)
                tcp.camera = Camera(self, tcp, self.filt)

                # TODO: figure out which object to pack this into
                nsamples = ((tcp.camera.sample_density * cp.width * cp.height) /
                            (cp.nbatches * cp.ntemporal_samples))
                samples_per_thread = nsamples / ctx.threads + 15

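Editor's worked example of the per-thread sample budget computed above, with made-up genome values; the +15 term is taken verbatim from the code, and the thread count matches the 256x54 launch shape used in Animation.compile below.

    sample_density, width, height = 50, 640, 480     # illustrative values
    nbatches, ntemporal_samples = 1, 27               # illustrative values
    threads = 256 * 54                                # ctx.threads = 13824
    nsamples = (sample_density * width * height) // (nbatches * ntemporal_samples)
    samples_per_thread = nsamples // threads + 15
    print(samples_per_thread)                         # 41 + 15 = 56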
                cp_streamer.pack_into(stream,
                        frame=self,
                        cp=tcp,
                        cp_idx=idx,
                        samples_per_thread=samples_per_thread)
        stream.seek(0)
        return (stream.read(), cp.nbatches * cp.ntemporal_samples)

class Animation(object):
    """
    Control structure for rendering a series of frames.
@@ -31,46 +81,46 @@ class Animation(object):
            memmove(byref(self.genomes[i]), byref(genomes[i]),
                    sizeof(BaseGenome))

        self._frame = Frame()
        self._frame.genomes = cast(self.genomes, POINTER(BaseGenome))
        self._frame.ngenomes = len(genomes)
        self.features = Features(genomes)
        self.frame = Frame()
        self.frame.genomes = cast(self.genomes, POINTER(BaseGenome))
        self.frame.ngenomes = len(genomes)

        self.ctx = None

    def compile(self):
        """
        Create a PTX kernel optimized for this animation, compile it, and
        attach it to a LaunchContext with a thread distribution optimized for
        the active device.
        """
        # TODO: user-configurable test control
        self.ctx = LaunchContext([IterThread], block=(256,1,1), grid=(54,1),
                                 tests=True)
        # TODO: user-configurable verbosity control
        self.ctx.compile(verbose=3, anim=self, features=self.features)
        # TODO: automatic optimization of block parameters

    def render_frame(self, time=0):
        # TODO: support more nuanced frame control than just 'time'
        # TODO: reuse more information between frames
        # TODO: allow animation-long override of certain parameters (size, etc)
        cp_stream, num_cps = self.frame.pack_stream(self.ctx, time)
        iter_thread = self.ctx.ptx.instances[IterThread]
        iter_thread.upload_cp_stream(self.ctx, cp_stream, num_cps)
        iter_thread.call(self.ctx)

        cp = BaseGenome()
        flam3_interpolate(self.frame.genomes, len(self.genomes), time, 0,
                          byref(cp))
        filt = Filters(self.frame, cp)
        rw = cp.spatial_oversample * cp.width  + 2 * filt.gutter
        rh = cp.spatial_oversample * cp.height + 2 * filt.gutter
class Features(object):
    """
    Determine features and constants required to render a particular set of
    genomes. The values of this class are fixed before compilation begins.
    """
    # Constant; number of rounds spent fusing points on first CP of a frame
    num_fuse_samples = 25

        # Allocate buckets, accumulator
        # Loop over all batches:
        #   [density estimation]
        #   Loop over all temporal samples:
        #     Color scalar = temporal filter at index
        #     Interpolate and get control point
        #     Precalculate
        #     Prepare xforms
        #     Compute colormap
        #     Run iterations
        #     Accumulate vibrancy, gamma, background
        #   Calculate k1, k2
        #   If not DE, then do log filtering to accumulator
        #   Else, [density estimation]
        # Do final clip and filter

        # For now:
        # Loop over all batches:
        #   Loop over all temporal samples:
        #     Interpolate and get control point
        #     Read the
        #     Dump noise into buckets
        #   Do log filtering to accumulator
        # Do simplified final clip
    def __init__(self, genomes):
        self.max_ntemporal_samples = max(
                [cp.nbatches * cp.ntemporal_samples for cp in genomes]) + 1

class Filters(object):
    def __init__(self, frame, cp):
@@ -115,7 +165,7 @@ class Camera(object):
        scale = 2.0 ** cp.zoom
        self.sample_density = cp.sample_density * scale * scale

        center = Point(cp.center[0], cp.center[1])
        center = Point(cp._center[0], cp._center[1])
        size = Point(cp.width, cp.height)
        # pix per unit, where 'unit' is '1.0' in IFS space
        self.ppu = Point(
@@ -129,4 +179,3 @@ class Camera(object):
        self.ifs_space_size = 1.0 / (self.upper_bounds - self.lower_bounds)
        # TODO: coordinate transforms in concert with GPU (rotation, size)