From a23ebdcf5f783095b949a9e05fa5b016da9133d7 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Fri, 27 Aug 2010 12:28:02 -0400 Subject: [PATCH] Initial commit. --- helpers/genprimes.c | 57 ++++++++++ main.py | 257 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 314 insertions(+) create mode 100644 helpers/genprimes.c create mode 100644 main.py diff --git a/helpers/genprimes.c b/helpers/genprimes.c new file mode 100644 index 0000000..71962d5 --- /dev/null +++ b/helpers/genprimes.c @@ -0,0 +1,57 @@ +/* Public domain, FWIW + * gcc -o genprimes -lgmp genprimes.c; ./genprimes > primes.bin + * see http://www.ast.cam.ac.uk/~stg20/cuda/random/index.html + */ +#include +#include +#include +#include +#include + +/* To verify the primes against the linked URL, compile this instead: +int main() { + FILE *fp = fopen("primes.bin", "r"); + char stuff[5]; + while (fread(stuff, 4, 1, fp) == 1) { + uint64_t i = *((uint32_t*)stuff); + i = (i>>24) + (1<<8)*((i>>16)&0xff) + (1<<16)*((i>>8)&0xff) + (1<<24)*(i&0xff); + uint64_t j = i * 4294967296L - 1; + uint64_t k = (j-1)/2; + printf("%lu %lu %lu\n", i, j, k); + } +} +*/ + +int main(int argc, char* argv[]) { + fprintf(stderr, "Generating list of multipliers for mod(2^32) MWC RNG\n"); + mpz_t candidate, twotothethirtytwo; + mpz_init(candidate); + mpz_init_set_d(twotothethirtytwo, (double) (4294967296L)); + + char bytes[5]; + bytes[4] = 0; + unsigned int i, found=0; + for (i = 4294967295L; i > 2147483648; i--) { + mpz_set_ui(candidate, i); + mpz_mul(candidate, candidate, twotothethirtytwo); + mpz_sub_ui(candidate, candidate, 1); + if(mpz_probab_prime_p(candidate, 200)) { + mpz_sub_ui(candidate, candidate, 1); + mpz_tdiv_q_ui(candidate, candidate, 2); + if(mpz_probab_prime_p(candidate, 200)) { + bytes[0] = (i>>24)&0xff; + bytes[1] = (i>>16)&0xff; + bytes[2] = (i>>8)&0xff; + bytes[3] = i&0xff; + fwrite(bytes, 4, 1, stdout); + found++; + if (!(found&0xff)) fprintf(stderr, "."); + } + } + } + fprintf(stderr, "\nFound %d multipliers.\n", found); + mpz_clear(candidate); + mpz_clear(twotothethirtytwo); +} + + diff --git a/main.py b/main.py new file mode 100644 index 0000000..c2cdcef --- /dev/null +++ b/main.py @@ -0,0 +1,257 @@ +#!/usr/bin/python +# +# flam3cuda, one of a surprisingly large number of ports of the fractal flame +# algorithm to NVIDIA GPUs. +# +# This one is copyright 2010 Steven Robertson +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 or later +# as published by the Free Software Foundation. + +import os +import sys +import ctypes +import struct + +import tempita + +# These imports are order-sensitive! +import pyglet +import pyglet.gl as gl +gl.get_current_context() + +import pycuda.driver as cuda +import pycuda.gl as cudagl +import pycuda.gl.autoinit +from pycuda.compiler import SourceModule + +from multiprocessing import Process, Queue + +import numpy as np + +from fr0stlib import pyflam3 + +# PTX header and functions used for debugging. +prelude = """ +.version 2.0 +.target sm_20 + +.func (.reg .u32 $ret) get_gtid () +{ + .reg .u16 tmp; + .reg .u32 cta, ncta, tid, gtid; + + mov.u16 tmp, %ctaid.x; + cvt.u32.u16 cta, tmp; + mov.u16 tmp, %ntid.x; + cvt.u32.u16 ncta, tmp; + mul24.lo.u32 gtid, cta, ncta; + + mov.u16 tmp, %tid.x; + cvt.u32.u16 tid, tmp; + add.u32 gtid, gtid, tid; + mov.b32 $ret, gtid; + ret; +} + +.entry write_to_buffer ( .param .u32 bufbase ) +{ + .reg .u32 base, gtid, off; + + ld.param.u32 base, [bufbase]; + call.uni (off), get_gtid, (); + mad24.lo.u32 base, off, 4, base; + st.volatile.global.b32 [base], off; +} +""" + +class CUGenome(pyflam3.Genome): + def _render(self, frame, trans): + obuf = (ctypes.c_ubyte * ((3+trans)*self.width*self.height))() + stats = pyflam3.RenderStats() + pyflam3.flam3_render(ctypes.byref(frame), obuf, pyflam3.flam3_field_both, + trans+3, trans, ctypes.byref(stats)) + return obuf, stats, frame + +class LaunchContext(self): + def __init__(self, seed=None): + self.block, self.grid, self.threads = None, None, None + self.stream = cuda.Stream() + self.rand = mtrand.RandomState(seed) + + def set_size(self, block, grid): + self.block, self.grid = block, grid + self.threads = reduce(lambda a, b: a*b, self.block + self.grid) + +class PTXFragment(object): + """ + Wrapper for sections of template PTX. + + In order to provide the best optimization, and avoid a web of hard-coded + parameters, the PTX module may be regenerated and recompiled several times + with different or incomplete launch context parameters. To this end, avoid + accessing the GPU in such functions, and do not depend on context values + which are marked as "tuned" in the LaunchContext docstring being available. + + The final compilation pass is guaranteed to have all "tuned" values fixed + in their final values for the stream. + + Template code will be processed recursively until all "{{" instances have + been replaced, using the same namespace each time. + """ + + def deps(self, ctx): + """ + Returns a list of PTXFragment objects on which this object depends + for successful compilation. Circular dependencies are forbidden. + """ + return [] + + def subs(self, ctx): + """ + Returns a dict of items to add to the template substitution namespace. + The entire dict will be assembled, including all dependencies, before + any templates are evaluated. + """ + return {} + + def prelude(self, ctx): + """ + Returns a template string containing any code (variable declarations, + probably) that should be inserted at module scope. The prelude of + all deps will be inserted above this prelude. + """ + return "" + + def entryPrelude(self, ctx): + """ + Returns a template string that should be inserted at the top of any + entry point which depends on this method. The entry prelude of all + deps will be inserted above this entry prelude. + """ + return "" + + def setUp(self, ctx): + """ + Do start-of-stream initialization, such as copying data to the device. + """ + pass + + def test(self, ctx): + """ + Perform device tests. Returns True on success, False on failure, + or raises an exception. + """ + return True + +class PTXEntryPoint(PTXFragment): + def entry(self, ctx): + """ + Returns a template string corresponding to a PTX entry point. + """ + pass + + def call(self, ctx): + """ + Calls the entry point on the device. Haven't worked out the details + of this one yet. + """ + pass + + + +class DeviceHelpers(PTXFragment): + """This one's included by default, no need to depend on it""" + def subs(self, ctx): + return { + 'PTRT': ctypes.sizeof(ctypes.c_void_p) == 8 and '.u64' or '.u32', + } + +class MWCRandGen(PTXFragment): + + _prelude = """ + .const {{PTRT}} mwc_rng_mults_p; + .const {{PTRT}} mwc_rng_seeds_p; + """ + + def __init__(self): + if not os.path.isfile(os.path.join(os.path.dirname(__FILE__), + 'primes.bin')): + raise EnvironmentError('primes.bin not found') + + def prelude(self): + return self._prelude + + def setUp(self, ctx): + # Load raw big-endian u32 multipliers from primes.bin. + with open('primes.bin') as primefp: + dt = np.dtype(np.uint32).newbyteorder('B') + mults = np.frombuffer(primefp.read(), dtype=dt) + # Randomness in choosing multipliers is good, but larger multipliers + # have longer periods, which is also good. This is a compromise. + ctx.rand.shuffle(mults[:ctx.threads*4]) + # Copy multipliers and seeds to the device + devmp, devml = ctx.mod.get_global('mwc_rng_mults') + cuda.memcpy_htod_async(devmp, mults.tostring()[:devml], ctx.stream) + devsp, devsl = ctx.mod.get_global('mwc_rng_seeds') + cuda.memcpy_htod_async(devsp, ctx.rand.bytes(devsl), ctx.stream) + + def _next_b32(self, dreg): + return """ + mul.wide.u32 mwc_rng_ + mul.wide.u32 + + + + def templates(self, ctx): + return {'mwc_next_b32', self._next_b32} + + + def test(self, ctx): + + + + + + + + + def launch(self, ctx): + if self.mults + + + +def main(genome_path): + + + + + #with open(genome_path) as fp: + #genome = CUGenome.from_string(fp.read())[0] + #genome.width, genome.height = 512, 512 + #genome.sample_density = 1000 + #obuf, stats, frame = genome.render(estimator=3) + #gc.collect() + + ##q.put(str(obuf)) + ##p = Process(target=render, args=(q, genome_path)) + ##p.start() + + #window = pyglet.window.Window() + #image = pyglet.image.ImageData(genome.width, genome.height, 'RGB', obuf) + #tex = image.texture + + #@window.event + #def on_draw(): + #window.clear() + #tex.blit(0, 0) + + #pyglet.app.run() + +if __name__ == "__main__": + if len(sys.argv) < 2 or not os.path.isfile(sys.argv[1]): + print "First argument must be a path to a genome file" + sys.exit(1) + main(sys.argv[1]) +