diff --git a/cuburn/device_code.py b/cuburn/device_code.py index 375e8a7..02efd3b 100644 --- a/cuburn/device_code.py +++ b/cuburn/device_code.py @@ -15,13 +15,13 @@ from cuburn.variations import Variations class IterThread(PTXEntryPoint): entry_name = 'iter_thread' entry_params = [] - maxnreg = 16 def __init__(self): self.cps_uploaded = False def deps(self): - return [MWCRNG, CPDataStream, HistScatter, Variations, Timeouter] + return [MWCRNG, CPDataStream, HistScatter, Variations, ShufflePoints, + Timeouter] @ptx_func def module_setup(self): @@ -48,7 +48,7 @@ class IterThread(PTXEntryPoint): mem.shared.f32('s_xf_sel', ctx.warps_per_cta) - std.store_per_thread(g_whatever, 1234) + #std.store_per_thread(g_whatever, 1234) # TODO: temporary, for testing mem.local.u32('l_num_rounds') @@ -56,13 +56,11 @@ class IterThread(PTXEntryPoint): op.st.local.u32(addr(l_num_rounds), 0) op.st.local.u32(addr(l_num_writes), 0) - mem.local.f32('l_consec') - op.st.local.f32(addr(l_consec), 0.) - - reg.f32('x_coord y_coord color_coord') - mwc.next_f32_11(x_coord) - mwc.next_f32_11(y_coord) - mwc.next_f32_01(color_coord) + reg.f32('xi xo yi yo colori coloro consec_bad') + mwc.next_f32_11(xi) + mwc.next_f32_11(yi) + mwc.next_f32_01(colori) + op.mov.f32(consec_bad, 0.) comment("Ensure all init is done") op.bar.sync(0) @@ -104,6 +102,7 @@ class IterThread(PTXEntryPoint): # When fusing, num_samples holds the (negative) number of iterations # left across the CP, rather than the number of samples in total. with block("If still fusing, increment count unconditionally"): + op.bar.sync(0) std.set_is_first_thread(reg.pred('p_is_first')) op.red.shared.add.s32(addr(s_num_samples), 1, ifp=p_is_first) @@ -133,8 +132,6 @@ class IterThread(PTXEntryPoint): op.add.u32(num_rounds, num_rounds, 1) op.st.local.u32(addr(l_num_rounds), num_rounds) - - with block("Select an xform"): reg.f32('xf_sel') reg.u32('warp_offset xf_sel_addr') @@ -154,12 +151,9 @@ class IterThread(PTXEntryPoint): for xf in features.xforms: label('XFORM_%d' % xf.id) - variations.apply_xform(x_coord, y_coord, color_coord, - x_coord, y_coord, color_coord, xf.id) + variations.apply_xform(xo, yo, coloro, xi, yi, colori, xf.id) op.bra.uni("xform_done") - - label("xform_done") with block("Test if we're still in FUSE"): reg.s32('num_samples') @@ -170,7 +164,7 @@ class IterThread(PTXEntryPoint): reg.pred('p_point_is_valid') with block("Write the result"): - hist.scatter(x_coord, y_coord, color_coord, 0, p_point_is_valid) + hist.scatter(xo, yo, coloro, 0, p_point_is_valid) with block(): reg.u32('num_writes') op.ld.local.u32(num_writes, addr(l_num_writes)) @@ -180,17 +174,23 @@ class IterThread(PTXEntryPoint): with block("If the result was invalid, handle badvals"): reg.f32('consec') reg.pred('need_new_point') - op.ld.local.f32(consec, addr(l_consec)) - op.mov.f32(consec, 0., ifp=p_point_is_valid) + comment('If point is good, move new coords and reset consec_bad') + op.mov.f32(xi, xo, ifp=p_point_is_valid) + op.mov.f32(yi, yo, ifp=p_point_is_valid) + op.mov.f32(colori, coloro, ifp=p_point_is_valid) + op.mov.f32(consec_bad, 0., ifp=p_point_is_valid) + + comment('Otherwise, add 1 to consec_bad') op.add.f32(consec, consec, 1., ifnotp=p_point_is_valid) op.setp.ge.f32(need_new_point, consec, 5.) op.bra('badval_done', ifnotp=need_new_point) - mwc.next_f32_11(x_coord) - mwc.next_f32_11(y_coord) - mwc.next_f32_01(color_coord) + + comment('If consec_bad > 5, pick a new random point') + mwc.next_f32_11(xi) + mwc.next_f32_11(yi) + mwc.next_f32_01(colori) op.mov.f32(consec, 0.) 
label('badval_done') - op.st.local.f32(addr(l_consec), consec) with block("Increment number of samples by number of good values"): reg.b32('good_samples laneid') @@ -205,11 +205,16 @@ class IterThread(PTXEntryPoint): with block("Check to see if we're done with this CP"): reg.pred('p_cp_done') reg.s32('num_samples num_samples_needed') + comment('Sync before making decision to prevent divergence') + op.bar.sync(3) op.ld.shared.s32(num_samples, addr(s_num_samples)) cp.get(cpA, num_samples_needed, 'cp.nsamples') op.setp.ge.s32(p_cp_done, num_samples, num_samples_needed) op.bra.uni(cp_loop_start, ifp=p_cp_done) + comment('Shuffle points between threads') + shuf.shuffle(xi, yi, colori, consec_bad) + with block("If first warp, pick new thread offset"): reg.u32('warpid') reg.pred('first_warp') @@ -273,7 +278,7 @@ class IterThread(PTXEntryPoint): whatever = cuda.from_device(whatever_dp, shape, np.int32) print_thing("Rounds", rounds) print_thing("Writes", writes) - print_thing("Whatever", whatever) + #print_thing("Whatever", whatever) print np.sum(rounds) @@ -495,6 +500,41 @@ class HistScatter(PTXFragment): dtype=np.float32) +class ShufflePoints(PTXFragment): + """ + Shuffle points in shared memory. See helpers/shuf.py for details. + """ + shortname = "shuf" + + @ptx_func + def module_setup(self): + # TODO: if needed, merge this shared memory block with others + mem.shared.f32('s_shuf_data', ctx.threads_per_cta) + + @ptx_func + def shuffle(self, *args, **kwargs): + """ + Shuffle the data from each register in args across threads. Keyword + argument ``bar`` specifies which barrier to use. + """ + bar = kwargs.pop('bar', 8) + with block("Shuffle across threads"): + reg.u32('shuf_read shuf_write') + with block("Calculate read and write offsets"): + reg.u32('shuf_off shuf_laneid') + op.mov.u32(shuf_off, '%tid.x') + op.mov.u32(shuf_write, s_shuf_data) + op.mad.lo.u32(shuf_write, shuf_off, 4, shuf_write) + op.mov.u32(shuf_laneid, '%laneid') + op.mad.lo.u32(shuf_off, shuf_laneid, 32, shuf_off) + op.and_.b32(shuf_off, shuf_off, ctx.threads_per_cta - 1) + op.mov.u32(shuf_read, s_shuf_data) + op.mad.lo.u32(shuf_read, shuf_off, 4, shuf_read) + for var in args: + op.bar.sync(bar) + op.st.shared.b32(addr(shuf_write), var) + op.bar.sync(bar) + op.ld.shared.b32(var, addr(shuf_read)) class MWCRNG(PTXFragment): shortname = "mwc" diff --git a/helpers/shuf.py b/helpers/shuf.py new file mode 100644 index 0000000..5ea1ae2 --- /dev/null +++ b/helpers/shuf.py @@ -0,0 +1,287 @@ +# -*- coding: utf-8 -*- + +""" +Examples and documentation for the point shuffle. Run this file to produce the +full output with annotations. +""" + +import numpy as np + +w, t = 0, 0 + +def p(a): + for i in range(0, t, w): + print ' ' + ' '.join(['%2d' % a[j] for j in range(i,i+w)]) + +def make(): + a = np.ndarray(t, dtype=np.int32) + for i in range(t/w): + for j in range(w): + a[i*w+j] = i + return a + +for i in range(w): + a = shuf(a) + p(a) + +print """ +The problem +----------- + +If two points undergo the same transforms many times, they will begin to +converge. Since every thread in a warp selects the same transforms on each +iteration, leaving a given point in the same warp each time would effectively +cut the number of unique samples generated by 32. + +One fix is to allow transforms to diverge, but that has a direct impact upon +performance. Another is to simply swap points between different warps. This is +what we do here. 
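+
+To see the convergence concretely, here's a small sketch (host-side only, not
+part of the kernel; ``A`` and ``B`` are just example constants for a single
+contractive transform) showing two distinct points collapsing together when
+they are fed the same transform sequence every round::
+
+    import numpy as np
+    A, B = 0.5, 0.3
+    x = np.array([0.9, -0.7])
+    for i in range(20):
+        x = A * x + B          # both points always get the same transform
+    print abs(x[0] - x[1])     # ~1.5e-6: effectively a single sample now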
+
+For performance reasons, we can't do the swap in global memory, or between
+multiple CTAs; the cost of synchronization and the necessarily uncoalesced
+write pattern would put an enormous (orders of magnitude) dent in throughput.
+As a result, we have to keep things in shared memory, which is fast but has a
+few restrictions, of which we note two:
+
+ - There's not a lot of it. 1K is a good target; it gives the memory kernel
+   enough room in a 16K shared configuration to use 8K while retaining the
+   4-CTA-per-SM allocation we're shooting for. (More on this later.)
+
+ - No two threads in a warp should write to the same bank. More
+   specifically, for an address ``a``, the value ``a % 128`` should be
+   unique for each thread. See the CUDA docs for details.
+
+Mixing it up
+------------
+
+A simple algorithm for doing the swap that respects those constraints is given
+below. In this algorithm, for each coordinate value, each warp writes the value
+to memory, waits for all other warps to do the same, then reads back the new
+value. The trick is in the addresses which are read:
+
+    Let ``w`` be the warp size, ``n`` be the warp number within a CTA, ``l`` be
+    the lane ID, and ``t`` be the total number of threads within a CTA. The
+    address to which a point should be written is given by::
+
+        a = (l + n * w) % t
+
+    For reading, the address is given by::
+
+        a = (l + (n + l) * w) % t
+
+Note that the write address in this case is simply the thread ID.
+"""
+
+def shuf_simple(a):
+    b = np.empty_like(a)
+    for n in range(t/w):
+        for l in range(w):
+            b[(l+(n+l)*w)%t] = a[l+n*w]
+    return b
+
+print """
+It may help to visualize this. Here's a representation of a "before" state. In
+this block, rows represent different warps, while columns represent different
+thread lanes; if addresses proceed from left to right, top to bottom, the
+address would be the same as the thread ID as assigned by the GPU. In each
+example, the value in a particular cell is the warp number of the cell that
+the point started in.
+
+For these examples, the warp size ``w`` is 16, and ``t`` is 16*16=256. ::
+"""
+
+w, t = 16, 256
+a = make()
+p(a)
+
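+print """
+Before moving on to shuffled matrices, here's a quick host-side sanity check of
+the two constraints above (just a sketch; ``w`` stands in for the bank count,
+which happens to match the warp size in these examples): the addresses produced
+by the read formula should form a permutation of the CTA's slots, and within
+any one warp they should all land in different banks.
+"""
+
+def check_simple():
+    # Re-derive the addresses from the read formula above.
+    targets = [(l + (n + l) * w) % t for n in range(t/w) for l in range(w)]
+    # Every slot is hit exactly once, so the mapping is a permutation.
+    assert sorted(targets) == range(t)
+    # Within each warp, addresses are distinct modulo w, so no two lanes
+    # share a bank.
+    for n in range(t/w):
+        banks = [((l + (n + l) * w) % t) % w for l in range(w)]
+        assert len(set(banks)) == w
+
+check_simple()
+print "    (permutation and bank checks pass)"
+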
+print """
+After one round of the simple shuffle given above, the matrix looks like this::
+"""
+
+a = shuf_simple(a)
+p(a)
+
+print """
+This demonstrates two attractive properties:
+
+ - Both reads and writes satisfy the constraints to avoid bank conflicts
+   when writing to shared memory.
+
+ - No point ends up in a warp alongside another from the same origin.
+
+Of course, we should investigate its properties on continued iterations.
+Here's round 2::
+"""
+
+a = shuf_simple(a)
+p(a)
+
+print "\nAnd 3::\n"
+
+a = shuf_simple(a)
+p(a)
+
+print """
+We've been doing fairly well so far, but at the fourth iteration, things don't
+look as nice::
+"""
+
+a = shuf_simple(a)
+p(a)
+
+print """
+This *looks* bad, with a lot of points grouped together, but that may not be an
+obvious indicator of poor performance in a real situation. The linear
+relationship of the shuffle is obvious from the tables, and indeed some simple
+modular algebra can identify the patterns, but since I am a better programmer
+than a mathematician, I'm going to resort to empirical methods.
+
+"Meet my good friend, Monte Carlo"
+----------------------------------
+
+Since the iterative part of the flame algorithm is itself essentially a Monte
+Carlo simulation, it is appropriate that we use one to characterize its
+implementation. In fact, the test algorithm is remarkably similar to the way
+color values are calculated in the flame algorithm itself:
+
+ - Define a list of affine transformations, each with a certain probability
+   of being selected.
+
+ - For several iterations, in each warp:
+
+   - Choose one affine transformation from the list according to the
+     associated probabilities.
+
+   - Apply it to the current coordinate.
+
+We'll use a 1D affine transformation (``x_n = A * x_{n-1} + B``) and equal
+probabilities. After each iteration, we'll calculate the standard deviation of
+the output of each transform separately, and sum the total results [#]_. If
+there is more diversity among the input values, the final number should be
+larger; an exceptionally small number indicates reduced variability among the
+inputs.
+
+.. [#] To be honest, I'm not entirely sure *what* this represents,
+       statistically, but it *feels* right, and that's what counts.
+
+The initial inputs will be obtained using the same method as the flame
+algorithm: run random points through the algorithm a few times without
+recording the results. If each affine matrix is convergent, this should quickly
+produce suitable input points.
+
+To have something to compare it to, we specify two alternative shuffles. The
+first is no shuffle at all, which is expected to be rather terrible. The
+second is a completely random shuffle of the entire matrix in each round,
+something that's untenable on the device (at least with any algorithm I've yet
+thought of). I've also included the results of running this test with an
+effective warp width of 1, which would simulate the results of running a
+different transformation per thread regardless of warp configuration.
+"""
+
+def shuf_none(a):
+    return a
+
+def shuf_all(a):
+    np.random.shuffle(a)
+    return a
+
+print """
+Here's the set of affine transformations we'll be using:
+"""
+
+FUSE = 20
+aff = [(0.5, 0, 0.3), (0.5, 0.5, 0.3), (0.5, 1, 0.3), (0.25, 0.75, 0.1)]
+fmtstr = '    %-10s %-10s %s'
+print fmtstr % ('A', 'B', 'probability of selection')
+for k in aff:
+    print fmtstr % k
+
+print "\nAnd here are the results::\n"
+
+def monte(a, shuf, rounds, xf_w):
+    # Shuffle `a` with `shuf` for `rounds`, and for each group of `xf_w`
+    # threads, pick a new transform, apply it, and accumulate the result
+    stdsum = 0
+    for round in range(-FUSE, rounds):
+        xf_pts = [[] for i in aff]
+        for wrp in range(0, t, xf_w):
+            aff_id = 0
+            sel = np.random.rand(1)[0]
+            while sel > aff[aff_id][2]:
+                sel -= aff[aff_id][2]
+                aff_id += 1
+            a[wrp:wrp+xf_w] = a[wrp:wrp+xf_w] * aff[aff_id][0] + aff[aff_id][1]
+            xf_pts[aff_id].extend(a[wrp:wrp+xf_w])
+        if round >= 0:
+            stdsum += sum([np.std(x) for x in xf_pts])
+        a = shuf(a)
+    return stdsum / rounds
+
+print '    With no shuffle:     %g' % monte(make(), shuf_none, 1000, 16)
+print '    With full shuffle:   %g' % monte(make(), shuf_all, 1000, 16)
+print '    With simple shuffle: %g' % monte(make(), shuf_simple, 1000, 16)
+print '    With warp_width=1:   %g' % monte(make(), shuf_none, 1000, 1)
+
+print """
+Failing to shuffle the points clearly has a considerable negative impact on our
+ersatz "diversity" criterion. Fully shuffling the set of points each time
+substantially increases this result. It's not enough to close the gap between
+transform choice by warp and by thread, but it does improve things.
+
+A simple shuffle actually beats the full shuffle consistently by a few points.
+This may seem like a flawed result — "more shuffling means more diversity" —
+but it's actually expected.
+The simple shuffle guarantees that two points will never move together over
+one round, but the full shuffle happily reunites these neighbors from the
+previous round, lowering its performance. (For scale: a full shuffle of 256
+points puts two points that just shared a 16-wide warp back in the same warp
+with probability 15/255, about 6%, on every round.)
+
+As it turns out, the simple shuffle does quite well, despite its strong linear
+dependence. If this were the configuration of the device, I would actually be
+satisfied putting this into play directly.
+
+But things are, of course, messier.
+
+Hip to be square
+----------------
+
+The previous examples make use of a 16-thread warp size, while NVIDIA devices
+actually make use of a 32-thread warp. Since we can realistically only fit 256
+threads in a CTA, this leaves us with a CTA geometry of 32*8, rather than
+16*16. To see what this implies for our setup, check out a few rounds of the
+simple shuffle on a 16*4 matrix, which has the same aspect ratio as the 32*8
+device would::
+"""
+
+t = 64
+a = make()
+for i in range(5):
+    p(a)
+    print
+    a = shuf_simple(a)
+
+print """
+The simple shuffle was designed with a square grid in mind, and performs worse
+on the 32*8 device grid::
+"""
+
+t = 256
+w = 32
+
+print '    With no shuffle:     %g' % monte(make(), shuf_none, 1000, 32)
+print '    With full shuffle:   %g' % monte(make(), shuf_all, 1000, 32)
+print '    With simple shuffle: %g' % monte(make(), shuf_simple, 1000, 32)
+print '    With warp_width=1:   %g' % monte(make(), shuf_none, 1000, 1)
+
+def shuf_better(a):
+    b = np.empty_like(a)
+    for n in range(t/w):
+        r = np.random.randint(32)
+        for l in range(w):
+            b[(l+(n+l)*w)%t] = a[(l+r)%w+n*w]
+    return b
+
+print '    With better shuffle: %g' % monte(make(), shuf_better, 1000, 32)
+
+print """
+Okay, I actually intended this to be a blog post, but I started writing before
+having done any of the math. Actually, the simple shuffle looks like it's
+sufficient for now, and that's what I'm going to implement.
+"""
diff --git a/main.py b/main.py
index f313676..0ed10bc 100644
--- a/main.py
+++ b/main.py
@@ -41,9 +41,10 @@ def main(args):
     anim.compile()
     bins = anim.render_frame()
     #bins = np.log2(bins + 1)
-    bins *= (512./(np.mean([bins[y][x][3]
-        for x in range(anim.features.hist_width)
-        for y in range(anim.features.hist_height)])+1e-9))
+    alpha = [bins[y][x][3] for x in range(anim.features.hist_width)
+                           for y in range(anim.features.hist_height)]
+    print sum(alpha)
+    bins *= (512./(np.mean(alpha)+1e-9))
     bins = np.minimum(bins, 255)
     bins = bins.astype(np.uint8)