mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Known broken checkin to show algorias
This commit is contained in:
parent
cceb75396f
commit
5f8c2bbf08
@ -1,3 +1,7 @@
|
|||||||
|
"""
|
||||||
|
Contains the PTX fragments which will drive the device.
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -6,60 +10,147 @@ import numpy as np
|
|||||||
|
|
||||||
from cuburnlib.ptx import PTXFragment, PTXEntryPoint, PTXTest
|
from cuburnlib.ptx import PTXFragment, PTXEntryPoint, PTXTest
|
||||||
|
|
||||||
|
"""
|
||||||
|
Here's the current draft of the full algorithm implementation.
|
||||||
|
|
||||||
|
declare xform jump table
|
||||||
|
|
||||||
|
load random state
|
||||||
|
|
||||||
|
clear x_coord, y_coord, z_coord, w_coord;
|
||||||
|
store -(FUSE+1) to shared (per-warp) num_samples_sh
|
||||||
|
clear badvals [1]
|
||||||
|
|
||||||
|
load param (global_cp_idx_addr)
|
||||||
|
index table start (global_cp_idx) [2]
|
||||||
|
load count of indexes from global cp index =>
|
||||||
|
store to qlocal current_cp_num [3]
|
||||||
|
|
||||||
|
outermost loop start:
|
||||||
|
load current_cp_num
|
||||||
|
if current_cp_num <= 0:
|
||||||
|
exit
|
||||||
|
|
||||||
|
load param global_cp_idx_addr
|
||||||
|
calculate offset into address with current_cp_num, global_cp_idx_addr
|
||||||
|
load cp_base_address
|
||||||
|
stream_start (cp_base, cp_base_addr) [4]
|
||||||
|
|
||||||
|
FUSE_START:
|
||||||
|
num_samples += 1
|
||||||
|
if num_samples >= 0:
|
||||||
|
# Okay, we're done FUSEing, prepare to enter normal loop
|
||||||
|
load num_samples => store to shared (per-warp) num_samples
|
||||||
|
|
||||||
|
|
||||||
|
ITER_LOOP_START:
|
||||||
|
reg xform_addr, xform_stream_addr, xform_select
|
||||||
|
|
||||||
|
mwc_next_u32 to xform_select
|
||||||
|
# Performance test: roll/unroll this loop?
|
||||||
|
stream_load xform_prob (cp_stream)
|
||||||
|
if xform_select <= xform_prob:
|
||||||
|
bra.uni XFORM_1_LBL
|
||||||
|
...
|
||||||
|
stream_load xform_prob (cp_stream)
|
||||||
|
if xform_select <= xform_prob:
|
||||||
|
bra.uni XFORM_N_LBL
|
||||||
|
|
||||||
|
XFORM_1_LBL:
|
||||||
|
stream_load xform_1_ (cp_stream)
|
||||||
|
...
|
||||||
|
bra.uni XFORM_POST
|
||||||
|
|
||||||
|
XFORM_POST:
|
||||||
|
[if final_xform:]
|
||||||
|
[do final_xform]
|
||||||
|
|
||||||
|
if num_samples < 0:
|
||||||
|
# FUSE still in progress
|
||||||
|
bra.uni FUSE_START
|
||||||
|
|
||||||
|
FRAGMENT_WRITEBACK:
|
||||||
|
# Unknown at this time.
|
||||||
|
|
||||||
|
SHUFFLE:
|
||||||
|
# Unknown at this time.
|
||||||
|
|
||||||
|
load num_samples from num_samples_sh
|
||||||
|
num_samples -= 1
|
||||||
|
if num_samples > 0:
|
||||||
|
bra.uni ITER_LOOP_START
|
||||||
|
|
||||||
|
|
||||||
|
[1] Tracking 'badvals' can put a pretty large hit on performance, particularly
|
||||||
|
for images that sample a small amount of the grid. So this might be cut
|
||||||
|
when rendering for performance. On the other hand, it might actually help
|
||||||
|
tune the algorithm later, so it'll definitely be an option.
|
||||||
|
|
||||||
|
[2] Control points for each temporal sample will be preloaded to the
|
||||||
|
device in the compact DataStream format (more on this later). Their
|
||||||
|
locations are represented in an index table, which starts with a single
|
||||||
|
`.u32 length`, followed by `length` pointers. To avoid having to keep
|
||||||
|
reloading `length`, or worse, using a register to hold it in memory, we
|
||||||
|
instead count *down* to zero. This is a very common idiom.
|
||||||
|
|
||||||
|
[3] 'qlocal' is quasi-local storage. it could easily be actual local storage,
|
||||||
|
depending on how local storage is implemented, but the extra 128-byte loads
|
||||||
|
for such values might make a performance difference. qlocal variables may
|
||||||
|
be identical across a warp or even a CTA, and so variables noted as
|
||||||
|
"qlocal" here might end up in shared memory or even a small per-warp or
|
||||||
|
per-CTA buffer in global memory created specifically for this purpose,
|
||||||
|
after benchmarking is done.
|
||||||
|
|
||||||
|
[4] DataStreams are "opaque" data serialization structures defined below. The
|
||||||
|
structure of a stream is actually created while parsing the DSL by the load
|
||||||
|
statements themselves. Some benchmarks need to be done before DataStreams
|
||||||
|
stop being "opaque" and become simply "dynamic".
|
||||||
|
"""
|
||||||
|
|
||||||
class MWCRNG(PTXFragment):
|
class MWCRNG(PTXFragment):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.threads_ready = 0
|
self.threads_ready = 0
|
||||||
if not os.path.isfile('primes.bin'):
|
if not os.path.isfile('primes.bin'):
|
||||||
raise EnvironmentError('primes.bin not found')
|
raise EnvironmentError('primes.bin not found')
|
||||||
|
|
||||||
prelude = (".global .u32 mwc_rng_mults[{{ctx.threads}}];\n"
|
def module_setup(self):
|
||||||
".global .u64 mwc_rng_state[{{ctx.threads}}];")
|
mem.global_.u32('mwc_rng_mults', ctx.threads)
|
||||||
|
mem.global_.u32('mwc_rng_state', ctx.threads)
|
||||||
|
|
||||||
def _next_b32(self, dreg):
|
def entry_setup(self):
|
||||||
# TODO: make sure PTX optimizes away superfluous move instrs
|
reg.u32('mwc_st mwc_mult mwc_car')
|
||||||
return """
|
with block('Load MWC multipliers and states'):
|
||||||
{
|
reg.u32('mwc_off mwc_addr')
|
||||||
// MWC next b32
|
get_gtid(mwc_off)
|
||||||
.reg .u64 mwc_out;
|
op.mov.u32(mwc_addr, mwc_rng_mults)
|
||||||
cvt.u64.u32 mwc_out, mwc_car;
|
op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)
|
||||||
mad.wide.u32 mwc_out, mwc_st, mwc_mult, mwc_out;
|
op.ld.global_.u32(mwc_mult, addr(mwc_addr))
|
||||||
mov.b64 {mwc_st, mwc_car}, mwc_out;
|
|
||||||
mov.u32 %s, mwc_st;
|
|
||||||
}
|
|
||||||
""" % dreg
|
|
||||||
|
|
||||||
def subs(self, ctx):
|
op.mov.u32(mwc_addr, mwc_rng_state)
|
||||||
return {'mwc_next_b32': self._next_b32}
|
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
|
||||||
|
op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr))
|
||||||
|
|
||||||
entry_start = """
|
def entry_teardown(self):
|
||||||
.reg .u32 mwc_st, mwc_mult, mwc_car;
|
with block('Save MWC states'):
|
||||||
{
|
reg.u32('mwc_off mwc_addr')
|
||||||
// MWC load multipliers and RNG states
|
get_gtid(mwc_off)
|
||||||
.reg .u32 mwc_off, mwc_addr;
|
op.mov.u32(mwc_addr, mwc_rng_state)
|
||||||
{{ get_gtid('mwc_off') }}
|
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
|
||||||
mov.u32 mwc_addr, mwc_rng_mults;
|
op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
|
||||||
mad.lo.u32 mwc_addr, mwc_off, 4, mwc_addr;
|
|
||||||
ld.global.u32 mwc_mult, [mwc_addr];
|
|
||||||
mov.u32 mwc_addr, mwc_rng_state;
|
|
||||||
mad.lo.u32 mwc_addr, mwc_off, 8, mwc_addr;
|
|
||||||
ld.global.v2.u32 {mwc_st, mwc_car}, [mwc_addr];
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
|
|
||||||
entry_end = """
|
def next_b32(self, dst_reg):
|
||||||
{
|
with block('Load next random into ' + dst_reg.name):
|
||||||
// MWC save states
|
reg.u64('mwc_out')
|
||||||
.reg .u32 mwc_addr, mwc_off;
|
op.cvt.u64.u32(mwc_out, mwc_car)
|
||||||
{{ get_gtid('mwc_off') }}
|
mad.wide.u32(mwc_out, mwc_st)
|
||||||
mov.u32 mwc_addr, mwc_rng_state;
|
mov.b64(vec(mwc_st, mwc_car), mwc_out)
|
||||||
mad.lo.u32 mwc_addr, mwc_off, 8, mwc_addr;
|
mov.u32(dst_reg, mwc_st)
|
||||||
st.global.v2.u32 [mwc_addr], {mwc_st, mwc_car};
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
|
|
||||||
def set_up(self, ctx):
|
def set_up(self, ctx):
|
||||||
if self.threads_ready >= ctx.threads:
|
if self.threads_ready >= ctx.threads:
|
||||||
|
# Already set up enough random states, don't push again
|
||||||
return
|
return
|
||||||
|
|
||||||
# Load raw big-endian u32 multipliers from primes.bin.
|
# Load raw big-endian u32 multipliers from primes.bin.
|
||||||
with open('primes.bin') as primefp:
|
with open('primes.bin') as primefp:
|
||||||
dt = np.dtype(np.uint32).newbyteorder('B')
|
dt = np.dtype(np.uint32).newbyteorder('B')
|
||||||
@ -87,34 +178,35 @@ class MWCRNGTest(PTXTest):
|
|||||||
name = "MWC RNG sum-of-threads"
|
name = "MWC RNG sum-of-threads"
|
||||||
deps = [MWCRNG]
|
deps = [MWCRNG]
|
||||||
rounds = 10000
|
rounds = 10000
|
||||||
|
entry_name = 'MWC_RNG_test'
|
||||||
|
entry_params = ''
|
||||||
|
|
||||||
prelude = ".global .u64 mwc_rng_test_sums[{{ctx.threads}}];"
|
def module_setup(self):
|
||||||
|
mem.global_.u64(mwc_rng_test_sums, ctx.threads)
|
||||||
|
|
||||||
def entry(self, ctx):
|
@ptx_func
|
||||||
return ('MWC_RNG_test', '', """
|
def entry(self):
|
||||||
.reg .u64 sum, addl;
|
reg.u64('sum addl')
|
||||||
.reg .u32 addend;
|
reg.u32('addend')
|
||||||
mov.u64 sum, 0;
|
op.mov.u64(sum, 0)
|
||||||
{
|
with block('Sum next %d random numbers' % self.rounds):
|
||||||
.reg .u32 loopct;
|
reg.u32('loopct')
|
||||||
.reg .pred p;
|
pred('p')
|
||||||
mov.u32 loopct, %s;
|
op.mov.u32(loopct, self.rounds)
|
||||||
loopstart:
|
label('loopstart')
|
||||||
{{ mwc_next_b32('addend') }}
|
mwc_next_b32(addend)
|
||||||
cvt.u64.u32 addl, addend;
|
op.cvt.u64.u32(addl, addend)
|
||||||
add.u64 sum, sum, addl;
|
op.add.u64(sum, sum, addl)
|
||||||
sub.u32 loopct, loopct, 1;
|
op.sub.u32(loopct, loopct, 1)
|
||||||
setp.gt.u32 p, loopct, 0;
|
op.setp.gt.u32(p, loopct, 0)
|
||||||
@p bra.uni loopstart;
|
op.bra.uni(loopstart, ifp=p)
|
||||||
}
|
|
||||||
{
|
with block('Store sum and state'):
|
||||||
.reg .u32 addr, offset;
|
reg.u32('adr offset')
|
||||||
{{ get_gtid('offset') }}
|
get_gtid(offset)
|
||||||
mov.u32 addr, mwc_rng_test_sums;
|
op.mov.u32(adr, mwc_rng_test_sums)
|
||||||
mad.lo.u32 addr, offset, 8, addr;
|
op.mad.lo.u32(adr, offset, 8, adr)
|
||||||
st.global.u64 [addr], sum;
|
st.global_.u64(addr(adr), sum)
|
||||||
}
|
|
||||||
""" % self.rounds)
|
|
||||||
|
|
||||||
def call(self, ctx):
|
def call(self, ctx):
|
||||||
# Get current multipliers and seeds from the device
|
# Get current multipliers and seeds from the device
|
||||||
|
632
cuburnlib/ptx.py
632
cuburnlib/ptx.py
@ -1,38 +1,545 @@
|
|||||||
|
"""
|
||||||
|
PTX DSL, a domain-specific language for NVIDIA's PTX.
|
||||||
|
|
||||||
|
The DSL doesn't really provide any benefits over raw PTX in terms of type
|
||||||
|
safety or error checking. Where it shines is in enabling code reuse,
|
||||||
|
modularization, and dynamic data structures. In particular, the "data stream"
|
||||||
|
that controls the iterations and xforms in cuflame's device code is much
|
||||||
|
easier to maintain using this system.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# If you see 'import inspect', you know you're in for a good time
|
||||||
|
import inspect
|
||||||
import ctypes
|
import ctypes
|
||||||
import tempita
|
from collections import namedtuple
|
||||||
|
|
||||||
def ppr_ptx(src):
|
# Okay, so here's what's going on.
|
||||||
# TODO: Add variable realignment
|
#
|
||||||
indent = 0
|
# We're using Python to create PTX. If we just use Python to make one giant PTX
|
||||||
out = []
|
# module, there's no real reason to go to the trouble of using Python to
|
||||||
for line in [l.strip() for l in src.split('\n')]:
|
# begin with, as the things that this system is good for - modularization, unit
|
||||||
if not line:
|
# testing, automated analysis, and data structure generation and optimization -
|
||||||
continue
|
# pretty much require splitting code up into manageable units. However,
|
||||||
if len(line.split()) == 1 and line.endswith(':'):
|
# splitting things up at the level of PTX will greatly reduce performance, as
|
||||||
out.append(line)
|
# the cost of accessing the stack, spilling registers, and reloading data from
|
||||||
continue
|
# system memory is unacceptably high even on Fermi GPUs. So we want to split
|
||||||
if '}' in line and '{' not in line:
|
# code up into functions within Python, but not within the PTX.
|
||||||
indent -= 1
|
#
|
||||||
if line.startswith('@'):
|
# The challenge here is variable lifetime. A PTX function might declare a
|
||||||
out.append(' ' * ((indent - 1) * 4) + line)
|
# register at the top of the main block and use it several times throughout the
|
||||||
|
# function. In Python, we split that up into multiple functions, one to declare
|
||||||
|
# the registers at the start of the scope and another to make use of them later
|
||||||
|
# on. This makes it very easy to reuse a class of related PTX functions in
|
||||||
|
# different device entry points, do unit tests, and so on.
|
||||||
|
#
|
||||||
|
# The scope of the class instance is unrelated to the normal scope of names in
|
||||||
|
# Python. In fact, a function call frequently declares a register that may be
|
||||||
|
# needed by the parent function. So where to store the information regarding
|
||||||
|
# the register that was declared at the top of the file (name, type, etc)?
|
||||||
|
# Well, once declared, a variable remains in scope in PTX until the closing
|
||||||
|
# brace of the block (curly-braces segment) it was declared in. The natural
|
||||||
|
# place to store it would be in a Pythonic representation of the block: a block
|
||||||
|
# object that implements the context manager.
|
||||||
|
#
|
||||||
|
# This works well in terms of tracking object lifetime, but it adds a great
|
||||||
|
# deal of ugliness to the code. What I originally sought was this::
|
||||||
|
#
|
||||||
|
# def load_zero(dest_reg):
|
||||||
|
# op.mov.u32(dest_reg, 0)
|
||||||
|
# def init_module():
|
||||||
|
# reg.u32('hooray_reg')
|
||||||
|
# load_zero(hooray_reg)
|
||||||
|
#
|
||||||
|
# But using blocks to track state, it would turn into this ugliness::
|
||||||
|
#
|
||||||
|
# def load_zero(block, dest_reg):
|
||||||
|
# block.op.mov.u32(op.dest_reg, 0)
|
||||||
|
# def init_module():
|
||||||
|
# with Block() as block:
|
||||||
|
# block.regs.hooray_reg = block.reg.u32('hooray_reg')
|
||||||
|
# load_zero(block, block.regs.hooray_reg)
|
||||||
|
#
|
||||||
|
# Eeugh.
|
||||||
|
#
|
||||||
|
# Anyway, never one to use an acceptable solution when an ill-conceived hack
|
||||||
|
# was available, I poked and prodded until I found a way to attain my ideal.
|
||||||
|
# In short, a function with a 'ptx_func' decorator will be wrapped in a
|
||||||
|
# _BlockInjector context manager, which will temporarily add values to the
|
||||||
|
# function's global dictionary in such a way as to mimic the desired behavior.
|
||||||
|
# The decorator is kind enough to pop the values when exiting. The examples
|
||||||
|
# below give a clear picture of how to use it, but now you know why this
|
||||||
|
# abomination was crafted to begin with.
|
||||||
|
|
||||||
|
|
||||||
|
BlockCtx = namedtuple('BlockCtx', 'locals code injectors')
|
||||||
|
PTXStmt = namedtuple('PTXStmt', 'prefix op vars semi indent')
|
||||||
|
|
||||||
|
class _BlockInjector(object):
|
||||||
|
"""
|
||||||
|
A ContextManager that, upon entering a context, loads some keys into a
|
||||||
|
dictionary, and upon leaving it, removes those keys. If any keys are
|
||||||
|
already in the destination dictionary with a different value, an exception
|
||||||
|
is raised.
|
||||||
|
|
||||||
|
Useful if the destination dictionary is a func's __globals__.
|
||||||
|
"""
|
||||||
|
def __init__(self, to_inject, inject_into):
|
||||||
|
self.to_inject, self.inject_into = to_inject, inject_into
|
||||||
|
self.injected = set()
|
||||||
|
self.dead = True
|
||||||
|
def inject(self, kv, v=None):
|
||||||
|
"""Inject a key-value pair (passed either as a tuple or separately.)"""
|
||||||
|
k, v = v and (kv, v) or kv
|
||||||
|
if k not in self.to_inject:
|
||||||
|
self.to_inject[k] = v
|
||||||
|
if self.dead:
|
||||||
|
return
|
||||||
|
if k in self.inject_into:
|
||||||
|
if self.inject_into[k] is not v:
|
||||||
|
raise KeyError("Key with different value already in dest")
|
||||||
else:
|
else:
|
||||||
out.append(' ' * (indent * 4) + line)
|
self.inject_into[k] = v
|
||||||
if '{' in line and '}' not in line:
|
self.injected.add(k)
|
||||||
indent += 1
|
def __enter__(self):
|
||||||
return '\n'.join(out)
|
self.dead = False
|
||||||
|
map(self.inject, self.to_inject.items())
|
||||||
|
def __exit__(self, exc_type, exc_val, tb):
|
||||||
|
for k in self.injected:
|
||||||
|
del self.inject_into[k]
|
||||||
|
self.dead = True
|
||||||
|
|
||||||
def multisub(tmpl, subs):
|
class _Block(object):
|
||||||
while '{{' in tmpl:
|
"""
|
||||||
tmpl = tempita.Template(tmpl).substitute(subs)
|
State-tracker for PTX fragments. You should really look at Block and
|
||||||
return tmpl
|
PTXModule instead of here.
|
||||||
|
|
||||||
class PTXAssembler(object):
|
For important reasons, the instance must be bound locally as "_block".
|
||||||
|
"""
|
||||||
|
name = '_block'
|
||||||
|
def __init__(self):
|
||||||
|
self.outer_ctx = BlockCtx({self.name: self}, [], [])
|
||||||
|
self.stack = [self.outer_ctx]
|
||||||
|
def push_ctx(self):
|
||||||
|
self.stack.append(BlockCtx(dict(self.stack[-1].locals), [], []))
|
||||||
|
def pop_ctx(self):
|
||||||
|
bs = self.stack.pop()
|
||||||
|
self.stack[-1].code.append(bs.code)
|
||||||
|
def injector(self, func_globals):
|
||||||
|
inj = BlockInjector(self.stack[-1].locals, func_globals)
|
||||||
|
self.stack[-1].injectors.append(inj)
|
||||||
|
return inj
|
||||||
|
def inject(self, name, object):
|
||||||
|
if name in self.stack[-1].locals:
|
||||||
|
raise KeyError("Duplicate name already exists in this scope.")
|
||||||
|
self.stack[-1].locals[name] = object
|
||||||
|
[inj.inject(name, object) for inj in self.stack[-1].injectors]
|
||||||
|
def code(self, prefix='', op='', vars=[], semi=True, indent=0):
|
||||||
|
"""
|
||||||
|
Append a PTX statement (or thereabouts) to the current block.
|
||||||
|
|
||||||
|
- `prefix`: a string which will not be indented, regardless of the
|
||||||
|
current indent level, for labels and predicates.
|
||||||
|
- `op`: a string, aligned to current indent level.
|
||||||
|
- `vars`: a list of strings, with best-effort alignment.
|
||||||
|
- `semi`: whether to terminate the current line with a semicolon.
|
||||||
|
- `indent`: integer adjustment to the current indent level.
|
||||||
|
|
||||||
|
For `prefix`, `op`, and `vars`, a "string" can also mean a sequence of
|
||||||
|
objects that can be coerced to strings, which will be joined without
|
||||||
|
spacing. To keep things simple, nested lists and tuples will be reduced
|
||||||
|
in this manner (but not other iterable types). Coercion will not happen
|
||||||
|
until after the entire DSL call tree has been walked. This allows a
|
||||||
|
class to submit a mutable type (e.g. the trivial `StrVar`) when first
|
||||||
|
walked with an undefined value, then substitute the correct value on
|
||||||
|
being finalized.
|
||||||
|
|
||||||
|
Details about alignment are available in the `PTXFormatter` class. And
|
||||||
|
yes, the only real difference between `prefix`, `op`, and `vars` is in
|
||||||
|
final appearance, but it is in fact quite helpful for debugging.
|
||||||
|
"""
|
||||||
|
self.stack[-1].append(PTXStmt(prefix, op, vars, indent))
|
||||||
|
|
||||||
|
class StrVar(object):
    """
    Trivial wrapper to allow deferred variable substitution.

    The wrapped value is only read when the instance is converted with
    ``str()``, so ``val`` can be assigned (or reassigned) after the wrapper
    has already been handed to the code generator.
    """
    def __init__(self, val=None):
        self.val = val

    def __str__(self):
        # Must read the instance attribute; a bare `val` is not in scope here.
        return str(self.val)
|
||||||
|
|
||||||
|
def ptx_func(func):
    """
    Decorator function for code in the DSL. Any function which accesses the DSL
    namespace, including declared device variables and objects such as "reg"
    or "op", should be wrapped with this. See Block for some examples.

    When the wrapped function is called, the active block-state object is
    looked up under the name ``_block``: first in the function's own globals
    (where an enclosing injector would have placed it), then in the calling
    frame's locals, then in the calling frame's globals. The function body
    runs inside ``block.injector(func.__globals__)`` and its return value is
    passed through.

    Raises SyntaxError if no ``_block`` can be found.
    """
    from functools import wraps

    # Name under which the block-state object is bound; must stay in sync
    # with `_Block.name`.
    block_name = '_block'

    @wraps(func)
    def ptx_eval(*args, **kwargs):
        func_globals = func.__globals__
        if block_name in func_globals:
            block = func_globals[block_name]
        else:
            # Look at the immediate caller (not the outermost frames).
            parent = inspect.currentframe().f_back
            try:
                if block_name in parent.f_locals:
                    block = parent.f_locals[block_name]
                elif block_name in parent.f_globals:
                    block = parent.f_globals[block_name]
                else:
                    # Couldn't find the _block instance. Fail cryptically to
                    # encourage users to read the source (for now)
                    raise SyntaxError("Black magic")
            finally:
                # Drop the frame reference promptly to avoid reference cycles.
                del parent
        with block.injector(func_globals):
            return func(*args, **kwargs)
    return ptx_eval
|
||||||
|
|
||||||
|
class Block(object):
    """
    Limits the lifetime of variables in both PTX (using curly-braces) and in
    the Python DSL (via black magic). This is semantically useful, but should
    not otherwise affect device code (the lifetime of a register is
    aggressively minimized by the compiler).

    >>> with block('This comment will appear at the top of the block'):
    >>>     reg.u32('same_name')
    >>> with block():
    >>>     reg.u64('same_name') # OK, because 'same_name' went out of scope

    PTX variables declared inside a block will be available in any other
    ptx_func called within that block. Note that this flies in the face of
    normal Python behavior! That's why it's a DSL. (This doesn't apply to
    non-PTX variables.)

    >>> @ptx_func
    >>> def fn1():
    >>>     op.mov.u32(reg1, 0)
    >>>
    >>> @ptx_func
    >>> def fn2():
    >>>     print x
    >>>
    >>> @ptx_func
    >>> def fn3():
    >>>     with block():
    >>>         reg.u32('reg1')
    >>>         x = 4
    >>>         fn1() # OK: DSL magic propagates 'reg1' to fn1's namespace
    >>>         fn2() # FAIL: DSL magic doesn't touch regular variables
    >>>     fn1() # FAIL: 'reg1' went out of scope along with the block

    This constructor is available as 'block' in the DSL namespace.
    """
    def __init__(self, block):
        # `block` is the real _block
        self.block = block
        self.comment = None

    def __call__(self, comment=None):
        """Record the comment for the block about to be entered; returns self
        so the result can be used directly in a `with` statement."""
        self.comment = comment
        return self

    def __enter__(self):
        self.block.push_ctx()
        self.block.code(op='{', indent=4)
        # Consume the pending comment so it cannot leak into a later block
        # entered without calling the instance first.
        comment, self.comment = self.comment, None
        if comment is not None:
            self.block.code(op=['// ', comment], semi=False)

    def __exit__(self, exc_type, exc_value, tb):
        self.block.code(op='}', indent=-4)
        self.block.pop_ctx()
|
||||||
|
|
||||||
|
class _CallChain(object):
    """
    Handles the syntax for the operator chaining in PTX, like op.mul.u32.

    Each attribute access appends the attribute name to a pending chain and
    returns the same object; calling the object hands the accumulated chain
    (a list of strings) plus the call arguments to the subclass's `_call`
    method and resets the chain. The spelling `global_` is recorded as
    `global`, since the latter is a Python keyword.
    """
    def __init__(self, block):
        self.block = block
        # Single underscore on purpose: a double-underscore name would be
        # mangled per class and not match the accesses below.
        self._chain = []

    def __call__(self, *args, **kwargs):
        if not self._chain:
            raise SyntaxError("call chain is empty; use e.g. op.mov.u32(...)")
        # Reset before dispatching so a failing _call cannot leave stale
        # entries that would prefix the next operation.
        chain, self._chain = self._chain, []
        return self._call(chain, *args, **kwargs)

    def __getattr__(self, name):
        # Only invoked for attributes that are not found normally. Private
        # and dunder names are never chain elements; rejecting them prevents
        # infinite recursion when `_chain` or `_call` is missing.
        if name.startswith('_'):
            raise AttributeError(name)
        if name == 'global_':
            name = 'global'
        self._chain.append(name)
        # Another great crime against the universe:
        return self
|
||||||
|
|
||||||
|
class Reg(object):
    """
    Creates one or more registers. The argument should be a string containing
    one or more register names, separated by whitespace; the registers will be
    injected into the DSL namespace on creation, so you do not need to
    rebind them to the same name before use.

    >>> with block():
    >>>     reg.u32('addend product')
    >>>     op.mov.u32(addend, 0)
    >>>     op.mov.u32(product, 0)
    >>> op.mov.u32(addend, 1) # Fails, block unbinds globals on leaving scope

    This constructor is available as 'reg' in the DSL namespace.

    An instance of this class itself represents a single register: it stores
    the register's type and name, and converts to the bare name with `str()`.
    """
    def __init__(self, type, name):
        self.type, self.name = type, name

    def __str__(self):
        return self.name

    def __repr__(self):
        return '%s(%r, %r)' % (self.__class__.__name__, self.type, self.name)
|
||||||
|
|
||||||
|
class _RegFactory(_CallChain):
    """The actual 'reg' object in the DSL namespace."""
    def _call(self, type, names):
        """
        Declare one register per whitespace-separated name in `names`.

        `type` is the attribute chain collected by _CallChain and must hold
        exactly one element (e.g. ['u32']). A single `.reg` declaration
        covering all names is emitted, then each register is injected into
        the current block scope under its own name.
        """
        if len(type) != 1:
            raise SyntaxError("register type must be a single name, "
                              "e.g. reg.u32('a b')")
        regtype = type[0]
        names = names.split()
        self.block.code(op='.reg .' + regtype, vars=names)
        # Plain loop: injection is done purely for its side effect.
        for name in names:
            self.block.inject(name, Reg(regtype, name))
|
||||||
|
|
||||||
|
# Pending resolution of the op(regs, guard=x) debate
|
||||||
|
#class Pred(object):
|
||||||
|
#"""
|
||||||
|
#Allows for predicated execution of operations.
|
||||||
|
|
||||||
|
#>>> pred('p_some_test p_another_test')
|
||||||
|
#>>> op.setp.eq.u32(p_some_test, reg1, reg2)
|
||||||
|
#>>> op.setp.and.eq.u32(p_another_test, reg1, reg2, p_some_test)
|
||||||
|
#>>> with p_some_test.is_set():
|
||||||
|
#>>> op.ld.global.u32(reg1, addr(areg))
|
||||||
|
|
||||||
|
#Predication supports nested function calls, and will cover all code
|
||||||
|
#generated inside the predicate block:
|
||||||
|
|
||||||
|
#>>> with p_another_test.is_unset():
|
||||||
|
#>>> some_ptxdsl_function(reg2)
|
||||||
|
#>>> op.st.global.u32(addr(areg), reg2)
|
||||||
|
|
||||||
|
#It is a syntax error to declare registers,
|
||||||
|
#However, multiple predicate blocks cannot be nested. Doing so is a syntax
|
||||||
|
#error.
|
||||||
|
|
||||||
|
#>>> with p_some_test.is_set():
|
||||||
|
#>>> with p_another_test.is_unset():
|
||||||
|
#>>> pass
|
||||||
|
#SyntaxError: ...
|
||||||
|
#"""
|
||||||
|
#def __init__(self, name):
|
||||||
|
#self.name = name
|
||||||
|
#def is_set(self, isnot=False):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Op(_CallChain):
    """
    Performs an operation.

    >>> op.mov.u32(address, mwc_rng_test_sums)
    >>> op.mad.lo.u32(address, offset, 8, address)
    >>> op.st.global_.v2.u32(addr(address), vec(mwc_a, mwc_b))

    To make an operation conditional on a predicate, use 'ifp' or 'ifnotp':

    >>> reg.pred('p1')
    >>> op.setp.eq.u32(p1, reg1, reg2)
    >>> op.mul.lo.u32(reg1, reg1, reg2, ifp=p1)
    >>> op.add.u32(reg2, reg1, reg2, ifnotp=p1)

    Note that the global state-space should be written 'global_' to avoid
    conflict with the Python keyword. `addr` and `vec` are defined in Mem.

    This constructor is available as 'op' in DSL blocks.
    """
    def _call(self, op, *args, **kwargs):
        """
        Emit one PTX statement.

        `op` is the attribute chain (joined with '.'), `args` are the
        operands. The optional keywords `ifp` / `ifnotp` guard the statement
        with `@pred` / `@!pred`; giving both raises SyntaxError, and any
        other keyword raises TypeError.
        """
        # Named keywords after *args are not valid syntax on Python 2, so
        # they are extracted from **kwargs by hand.
        ifp = kwargs.pop('ifp', None)
        ifnotp = kwargs.pop('ifnotp', None)
        if kwargs:
            raise TypeError("unexpected keyword argument(s): " +
                            ', '.join(sorted(kwargs)))
        if ifp is not None and ifnotp is not None:
            raise SyntaxError("can't use both, fool")
        if ifp is not None:
            pred = ['@', ifp]
        elif ifnotp is not None:
            pred = ['@!', ifnotp]
        else:
            pred = ''
        # Operands are passed through unconverted: string coercion is
        # deferred by the block, and str() on the nested lists produced by
        # `vec`/`addr` would yield their repr rather than PTX text.
        self.block.code(prefix=pred, op='.'.join(op), vars=list(args))
|
||||||
|
|
||||||
|
class Mem(object):
    """
    Reserve memory, optionally with an array size attached.

    >>> mem.global_.u32('global_scalar')
    >>> mem.local.u32('context_sized_local_array', ctx.threads*4)
    >>> mem.shared.u32('shared_array', 12)
    >>> mem.const.u32('const_array_of_unknown_length', True)

    Like registers, memory allocations are injected into the global namespace
    for use by any functions inside the scope without extra effort.

    >>> with block('move address into memory'):
    >>>     reg.u32('mem_address')
    >>>     op.mov.u32(mem_address, global_scalar)

    This constructor is available as 'mem' in DSL blocks.
    """
    # Pretty much the same as 'Reg', duplicated only for clarity
    def __init__(self, type, name, array, init):
        self.type, self.name, self.array, self.init = type, name, array, init

    def __str__(self):
        return self.name

    @staticmethod
    def vec(*args):
        """
        Prepare vector arguments to a memory operation.

        >>> op.ld.global.v2.u32(vec(reg1, reg2), addr(areg))

        Returns a nested list ``['{', [a, ', ', b, ...], '}']`` in which every
        argument appears, separated by ``', '``.
        """
        parts = []
        for arg in args:
            if parts:
                parts.append(', ')
            parts.append(arg)
        return ['{', parts, '}']

    @staticmethod
    def addr(areg, aoffset=''):
        """
        Prepare an address to a memory operation, optionally specifying offset.

        >>> op.st.global.v2.u32(addr(areg), vec(reg1, reg2))
        >>> op.ld.global.v2.u32(vec(reg1, reg2), addr(areg, 8))

        Returns a five-element list ``['[', areg, plus, aoffset, ']']``. When
        no offset is given, `plus` and the offset slot are both ``''``.
        """
        # Compare against "no offset" explicitly: a truthiness test would
        # treat an offset of 0 as absent yet still emit the 0, giving "[reg0]".
        if aoffset is None or aoffset == '':
            return ['[', areg, '', '', ']']
        return ['[', areg, '+', aoffset, ']']
|
||||||
|
|
||||||
|
class _MemFactory(_CallChain):
    """Actual `mem` object"""
    def _call(self, type, name, array=False, initializer=None):
        """
        Declare a memory variable and inject it into the current scope.

        `type` is the attribute chain collected by _CallChain and must hold
        exactly two elements, state space and data type (e.g.
        ['global', 'u32']). `array` may be False (scalar), True (array of
        unspecified length, emitted as `[]`) or a size. If `initializer` is
        given it is appended after ` = `.
        """
        if len(type) != 2:
            raise SyntaxError("memory type must be <space>.<type>, "
                              "e.g. mem.global_.u32('name')")
        memobj = Mem(type, name, array, initializer)
        # _CallChain stores the block-state object as `self.block`.
        self.block.inject(name, memobj)
        if array is True:
            suffix = ['[]']
        elif array:
            suffix = ['[', array, ']']
        else:
            suffix = []
        if initializer is not None:
            suffix += [' = ', initializer]
        # `type` is a list; %-formatting with several fields needs a tuple.
        self.block.code(op=['.%s.%s ' % tuple(type), name, suffix])
|
||||||
|
|
||||||
|
class Label(object):
    """
    Specifies the target for a branch. Scoped in PTX? TODO: test.

    >>> label('infinite_loop')
    >>> op.bra.uni(infinite_loop)
    """
    def __init__(self, name):
        self.name = name

    def __str__(self):
        return self.name


class _LabelFactory(object):
    """The actual 'label' object in the DSL namespace."""
    def __init__(self, block):
        self.block = block

    def __call__(self, name):
        """
        Place a label named `name` at the current position.

        The Label object is injected into the current block scope first, so
        whatever error `inject` raises for a clashing name happens before any
        output is produced. Then `name:` is emitted as a statement prefix
        with no trailing semicolon.
        """
        self.block.inject(name, Label(name))
        self.block.code(prefix=[name, ':'], semi=False)
|
||||||
|
|
||||||
|
class PTXFragment(object):
    # NOTE(review): a second `class PTXFragment` is defined immediately after
    # this one in the same module and rebinds the name, so this stub looks
    # like a leftover draft. Confirm and remove it, or merge the hooks that
    # exist only here (`globals`, `device_init`) into the later definition.

    def module_setup(self):
        pass

    def entry_setup(self):
        pass

    def entry_teardown(self):
        pass

    def globals(self):
        pass

    def tests(self):
        pass

    def device_init(self, ctx):
        pass
|
||||||
|
|
||||||
|
class PTXFragment(object):
|
||||||
|
"""
|
||||||
|
An object containing PTX DSL functions.
|
||||||
|
|
||||||
|
In cuflame, several different versions of a given function may be
|
||||||
|
regenerated in rapid succession
|
||||||
|
|
||||||
|
The final compilation pass is guaranteed to have all "tuned" values fixed
|
||||||
|
in their final values for the stream.
|
||||||
|
|
||||||
|
Template code will be processed recursively until all "{{" instances have
|
||||||
|
been replaced, using the same namespace each time.
|
||||||
|
|
||||||
|
Note that any method which does not depend on 'ctx' can be replaced with
|
||||||
|
an instance of the appropriate return type. So, for example, the 'deps'
|
||||||
|
property can be a flat list instead of a function.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def deps(self):
|
||||||
|
"""
|
||||||
|
Returns a list of PTXFragment types on which this object depends
|
||||||
|
for successful compilation. Circular dependencies are forbidden,
|
||||||
|
but multi-level dependencies should be fine.
|
||||||
|
"""
|
||||||
|
return [DeviceHelpers]
|
||||||
|
|
||||||
|
def inject(self):
|
||||||
|
"""
|
||||||
|
Returns a dict of items to add to the DSL namespace. The namespace will
|
||||||
|
be assembled in dependency order before any ptx_funcs are called.
|
||||||
|
"""
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def module_setup(self):
|
||||||
|
"""
|
||||||
|
PTX function to declare things at module scope. It's a PTX syntax error
|
||||||
|
to perform operations at this scope, but we don't yet validate that at
|
||||||
|
the Python level. A module will call this function on all fragments in
|
||||||
|
dependency order.
|
||||||
|
|
||||||
|
If implemented, this function should use an @ptx_func decorator.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def entry_setup(self):
|
||||||
|
"""
|
||||||
|
PTX DSL function which will insert code at the start of an entry, for
|
||||||
|
initializing variables and similar per-entry state. An entry point will call
|
||||||
|
this function on all fragments used in that entry point in dependency
|
||||||
|
order.
|
||||||
|
|
||||||
|
If implemented, this function should use an @ptx_func decorator.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def entry_teardown(self):
|
||||||
|
"""
|
||||||
|
PTX DSL function which will insert code at the end of an entry, for any
|
||||||
|
clean-up that needs to be performed. An entry point will call this
|
||||||
|
function on all fragments used in the entry point in *reverse*
|
||||||
|
dependency order (i.e. fragments which this fragment depends on will be
|
||||||
|
cleaned up after this one).
|
||||||
|
|
||||||
|
If implemented, this function should use an @ptx_func decorator.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def tests(self, ctx):
|
||||||
|
"""
|
||||||
|
Returns a list of PTXTest classes which will test this fragment.
|
||||||
|
"""
|
||||||
|
return []
|
||||||
|
|
||||||
|
def set_up(self, ctx):
|
||||||
|
"""
|
||||||
|
Do start-of-stream initialization, such as copying data to the device.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
class PTXModule(object):
|
||||||
"""
|
"""
|
||||||
Assembles PTX fragments into a module.
|
Assembles PTX fragments into a module.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, ctx, entries, build_tests=False):
|
def __init__(self, entries, inject={}, build_tests=False):
|
||||||
self.assemble(ctx, entries, build_tests)
|
self._block = b = _Block()
|
||||||
|
self.initial_inject = dict(inject)
|
||||||
|
self._safeupdate(self.initial_inject, dict(block=Block(b),
|
||||||
|
mem=_MemFactory(b), reg=_RegFactory(b), op=Op(b),
|
||||||
|
label=_LabelFactory(b), _block=b)
|
||||||
|
self.needs_recompilation = True
|
||||||
|
self.max_compiles = 10
|
||||||
|
while self.needs_recompilation:
|
||||||
|
self.assemble(entries, build_tests)
|
||||||
|
self.max_compiles -= 1
|
||||||
|
|
||||||
def deporder(self, unsorted_instances, instance_map, ctx):
|
def deporder(self, unsorted_instances, instance_map, ctx):
|
||||||
"""
|
"""
|
||||||
@ -57,7 +564,7 @@ class PTXAssembler(object):
|
|||||||
if non_uniq: raise KeyError("Duplicate keys: %s" % ','.join(key))
|
if non_uniq: raise KeyError("Duplicate keys: %s" % ','.join(key))
|
||||||
dst.update(src)
|
dst.update(src)
|
||||||
|
|
||||||
def assemble(self, ctx, entries, build_tests):
|
def assemble(self, entries, build_tests):
|
||||||
"""
|
"""
|
||||||
Build the PTX source for the given set of entries.
|
Build the PTX source for the given set of entries.
|
||||||
"""
|
"""
|
||||||
@ -121,78 +628,7 @@ class PTXAssembler(object):
|
|||||||
self.instances = instances
|
self.instances = instances
|
||||||
self.tests = tests
|
self.tests = tests
|
||||||
|
|
||||||
class PTXFragment(object):
|
|
||||||
"""
|
|
||||||
Wrapper for sections of template PTX.
|
|
||||||
|
|
||||||
In order to provide the best optimization, and avoid a web of hard-coded
|
|
||||||
parameters, the PTX module may be regenerated and recompiled several times
|
|
||||||
with different or incomplete launch context parameters. To this end, avoid
|
|
||||||
accessing the GPU in such functions, and do not depend on context values
|
|
||||||
which are marked as "tuned" in the LaunchContext docstring being
|
|
||||||
available.
|
|
||||||
|
|
||||||
The final compilation pass is guaranteed to have all "tuned" values fixed
|
|
||||||
in their final values for the stream.
|
|
||||||
|
|
||||||
Template code will be processed recursively until all "{{" instances have
|
|
||||||
been replaced, using the same namespace each time.
|
|
||||||
|
|
||||||
Note that any method which does not depend on 'ctx' can be replaced with
|
|
||||||
an instance of the appropriate return type. So, for example, the 'deps'
|
|
||||||
property can be a flat list instead of a function.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def deps(self, ctx):
|
|
||||||
"""
|
|
||||||
Returns a list of PTXFragment objects on which this object depends
|
|
||||||
for successful compilation. Circular dependencies are forbidden,
|
|
||||||
but multi-level dependencies should be fine.
|
|
||||||
"""
|
|
||||||
return [DeviceHelpers]
|
|
||||||
|
|
||||||
def subs(self, ctx):
|
|
||||||
"""
|
|
||||||
Returns a dict of items to add to the template substitution namespace.
|
|
||||||
The entire dict will be assembled, including all dependencies, before
|
|
||||||
any templates are evaluated.
|
|
||||||
"""
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def prelude(self, ctx):
|
|
||||||
"""
|
|
||||||
Returns a template string containing any code (variable declarations,
|
|
||||||
probably) that should be inserted at module scope. The prelude of
|
|
||||||
all deps will be inserted above this prelude.
|
|
||||||
"""
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def entry_start(self, ctx):
|
|
||||||
"""
|
|
||||||
Returns a template string that should be inserted at the top of any
|
|
||||||
entry point which depends on this method. The entry starts of all
|
|
||||||
deps will be inserted above this entry prelude.
|
|
||||||
"""
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def entry_end(self, ctx):
|
|
||||||
"""
|
|
||||||
As above, but at the end of the calling function, and with the order
|
|
||||||
reversed (all dependencies will be inserted after this).
|
|
||||||
"""
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def tests(self, ctx):
|
|
||||||
"""
|
|
||||||
Returns a list of PTXTest classes which will test this fragment.
|
|
||||||
"""
|
|
||||||
return []
|
|
||||||
|
|
||||||
def set_up(self, ctx):
|
|
||||||
"""
|
|
||||||
Do start-of-stream initialization, such as copying data to the device.
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
class PTXEntryPoint(PTXFragment):
|
class PTXEntryPoint(PTXFragment):
|
||||||
# Human-readable entry point name
|
# Human-readable entry point name
|
||||||
|
Loading…
Reference in New Issue
Block a user