mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 03:30:05 -05:00
Initial commit.
This commit is contained in:
commit
a23ebdcf5f
57
helpers/genprimes.c
Normal file
57
helpers/genprimes.c
Normal file
@ -0,0 +1,57 @@
|
||||
/* Public domain, FWIW
|
||||
* gcc -o genprimes genprimes.c -lgmp; ./genprimes > primes.bin
|
||||
* see http://www.ast.cam.ac.uk/~stg20/cuda/random/index.html
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <gmp.h>
|
||||
|
||||
/* To verify the primes against the linked URL, compile this instead:
|
||||
int main() {
|
||||
FILE *fp = fopen("primes.bin", "r");
|
||||
char stuff[5];
|
||||
while (fread(stuff, 4, 1, fp) == 1) {
|
||||
uint64_t i = *((uint32_t*)stuff);
|
||||
i = (i>>24) + (1<<8)*((i>>16)&0xff) + (1<<16)*((i>>8)&0xff) + (1<<24)*(i&0xff);
|
||||
uint64_t j = i * 4294967296L - 1;
|
||||
uint64_t k = (j-1)/2;
|
||||
printf("%lu %lu %lu\n", i, j, k);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
fprintf(stderr, "Generating list of multipliers for mod(2^32) MWC RNG\n");
|
||||
mpz_t candidate, twotothethirtytwo;
|
||||
mpz_init(candidate);
|
||||
mpz_init_set_d(twotothethirtytwo, (double) (4294967296L));
|
||||
|
||||
char bytes[5];
|
||||
bytes[4] = 0;
|
||||
unsigned int i, found=0;
|
||||
for (i = 4294967295L; i > 2147483648; i--) {
|
||||
mpz_set_ui(candidate, i);
|
||||
mpz_mul(candidate, candidate, twotothethirtytwo);
|
||||
mpz_sub_ui(candidate, candidate, 1);
|
||||
if(mpz_probab_prime_p(candidate, 200)) {
|
||||
mpz_sub_ui(candidate, candidate, 1);
|
||||
mpz_tdiv_q_ui(candidate, candidate, 2);
|
||||
if(mpz_probab_prime_p(candidate, 200)) {
|
||||
bytes[0] = (i>>24)&0xff;
|
||||
bytes[1] = (i>>16)&0xff;
|
||||
bytes[2] = (i>>8)&0xff;
|
||||
bytes[3] = i&0xff;
|
||||
fwrite(bytes, 4, 1, stdout);
|
||||
found++;
|
||||
if (!(found&0xff)) fprintf(stderr, ".");
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "\nFound %d multipliers.\n", found);
|
||||
mpz_clear(candidate);
|
||||
mpz_clear(twotothethirtytwo);
|
||||
}
|
||||
|
||||
|
257
main.py
Normal file
257
main.py
Normal file
@ -0,0 +1,257 @@
|
||||
#!/usr/bin/python
|
||||
#
|
||||
# flam3cuda, one of a surprisingly large number of ports of the fractal flame
|
||||
# algorithm to NVIDIA GPUs.
|
||||
#
|
||||
# This one is copyright 2010 Steven Robertson <steven@strobe.cc>
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 or later
|
||||
# as published by the Free Software Foundation.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import ctypes
|
||||
import struct
|
||||
|
||||
import tempita
|
||||
|
||||
# These imports are order-sensitive!
|
||||
import pyglet
|
||||
import pyglet.gl as gl
|
||||
gl.get_current_context()
|
||||
|
||||
import pycuda.driver as cuda
|
||||
import pycuda.gl as cudagl
|
||||
import pycuda.gl.autoinit
|
||||
from pycuda.compiler import SourceModule
|
||||
|
||||
from multiprocessing import Process, Queue
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fr0stlib import pyflam3
|
||||
|
||||
# PTX header and functions used for debugging.
#
# The text declares PTX ISA version 2.0 for target sm_20 and defines:
#   * get_gtid        -- returns %ctaid.x * %ntid.x + %tid.x, using a 24-bit
#                        multiply (mul24.lo), as a u32.
#   * write_to_buffer -- entry point taking a 32-bit base address `bufbase`;
#                        each thread stores the value from get_gtid as a
#                        32-bit word at bufbase + 4 * that value.
# NOTE(review): nothing in the visible part of this file references this
# string -- confirm whether it is still needed.
|
||||
prelude = """
|
||||
.version 2.0
|
||||
.target sm_20
|
||||
|
||||
.func (.reg .u32 $ret) get_gtid ()
|
||||
{
|
||||
.reg .u16 tmp;
|
||||
.reg .u32 cta, ncta, tid, gtid;
|
||||
|
||||
mov.u16 tmp, %ctaid.x;
|
||||
cvt.u32.u16 cta, tmp;
|
||||
mov.u16 tmp, %ntid.x;
|
||||
cvt.u32.u16 ncta, tmp;
|
||||
mul24.lo.u32 gtid, cta, ncta;
|
||||
|
||||
mov.u16 tmp, %tid.x;
|
||||
cvt.u32.u16 tid, tmp;
|
||||
add.u32 gtid, gtid, tid;
|
||||
mov.b32 $ret, gtid;
|
||||
ret;
|
||||
}
|
||||
|
||||
.entry write_to_buffer ( .param .u32 bufbase )
|
||||
{
|
||||
.reg .u32 base, gtid, off;
|
||||
|
||||
ld.param.u32 base, [bufbase];
|
||||
call.uni (off), get_gtid, ();
|
||||
mad24.lo.u32 base, off, 4, base;
|
||||
st.volatile.global.b32 [base], off;
|
||||
}
|
||||
"""
|
||||
|
||||
class CUGenome(pyflam3.Genome):
    """Genome subclass whose `_render` hands the frame to `flam3_render`."""

    def _render(self, frame, trans):
        """
        Render `frame` into a freshly allocated byte buffer.

        The buffer holds ``3 + trans`` bytes for each of the
        ``self.width * self.height`` pixels.

        Returns the tuple ``(buffer, stats, frame)``, where `stats` is the
        `RenderStats` structure passed to `flam3_render`.
        """
        channels = 3 + trans
        pixels = (ctypes.c_ubyte * (channels * self.width * self.height))()
        render_stats = pyflam3.RenderStats()
        pyflam3.flam3_render(
            ctypes.byref(frame),
            pixels,
            pyflam3.flam3_field_both,
            channels,
            trans,
            ctypes.byref(render_stats),
        )
        return pixels, render_stats, frame
|
||||
|
||||
class LaunchContext(object):
    """
    Launch state handed to PTX fragments as `ctx`.

    Attributes:
        block, grid: launch dimensions stored by `set_size`; None until then.
        threads: product of every block and grid dimension; None until
            `set_size` has been called.
        stream: the `cuda.Stream` created for this context.
        rand: a numpy `RandomState` seeded with `seed`.

    NOTE(review): the PTXFragment docstring refers to values marked as
    "tuned" here; presumably block/grid/threads -- confirm and mark them.
    """

    def __init__(self, seed=None):
        """Create the stream and RNG; `seed=None` lets numpy pick a seed."""
        self.block, self.grid, self.threads = None, None, None
        self.stream = cuda.Stream()
        # `mtrand` was never imported; reach RandomState through the numpy
        # alias the file already has.
        self.rand = np.random.RandomState(seed)

    def set_size(self, block, grid):
        """
        Record the launch dimensions and derive the total thread count.

        `block` and `grid` must be sequences of the same type (e.g. tuples)
        so that they can be concatenated.
        """
        self.block, self.grid = block, grid
        # Explicit product instead of the builtin `reduce`, which exists only
        # on Python 2.
        total = 1
        for dim in self.block + self.grid:
            total *= dim
        self.threads = total
|
||||
|
||||
class PTXFragment(object):
    """
    Base class for a piece of templated PTX.

    The PTX module may be generated and compiled more than once, with
    launch-context parameters that differ or are still incomplete, so that
    tuning does not rely on hard-coded values. Methods that produce template
    text should therefore not touch the GPU, and should not assume that
    context values described as "tuned" in the LaunchContext docstring are
    available yet.

    On the final compilation pass every "tuned" value is guaranteed to hold
    the value it will keep for the stream.

    Templates are expanded repeatedly, always against the same namespace,
    until no "{{" remains in the text.
    """

    def deps(self, ctx):
        """
        List the PTXFragment objects this fragment needs in order to compile.

        Dependencies must not form a cycle. The base implementation has none.
        """
        return []

    def subs(self, ctx):
        """
        Return a dict of entries to merge into the template namespace.

        The namespace is fully assembled, dependencies included, before any
        template is evaluated. The base implementation contributes nothing.
        """
        return {}

    def prelude(self, ctx):
        """
        Return template text to place at module scope (most likely variable
        declarations).

        The preludes of all dependencies are emitted above this one. The
        base implementation emits nothing.
        """
        return ""

    def entryPrelude(self, ctx):
        """
        Return template text to place at the top of every entry point that
        depends on this fragment.

        The entry preludes of all dependencies are emitted above this one.
        The base implementation emits nothing.
        """
        return ""

    def setUp(self, ctx):
        """
        Perform start-of-stream initialization, e.g. copying data to the
        device. The base implementation does nothing.
        """
        return None

    def test(self, ctx):
        """
        Run device tests.

        Returns True on success and False on failure; may also raise.
        The base implementation always succeeds.
        """
        return True
|
||||
|
||||
class PTXEntryPoint(PTXFragment):
    """A PTXFragment that additionally provides a PTX entry point."""

    def entry(self, ctx):
        """
        Return the template text for this fragment's PTX entry point.

        Not implemented here; the base version returns None.
        """
        return None

    def call(self, ctx):
        """
        Invoke the entry point on the device.

        The calling convention has not been worked out yet, so the base
        version does nothing and returns None.
        """
        return None
|
||||
|
||||
|
||||
|
||||
class DeviceHelpers(PTXFragment):
    """Always included; other fragments need not list it as a dependency."""

    def subs(self, ctx):
        """
        Provide 'PTRT', the PTX integer type with the width of a host
        pointer: '.u64' when `c_void_p` is 8 bytes, otherwise '.u32'.
        """
        pointer_is_64bit = ctypes.sizeof(ctypes.c_void_p) == 8
        return {
            'PTRT': '.u64' if pointer_is_64bit else '.u32',
        }
|
||||
|
||||
class MWCRandGen(PTXFragment):
|
||||
|
||||
# Module-scope PTX template returned by `prelude`. It declares two `.const`
# symbols whose type is the `{{PTRT}}` substitution (supplied by
# DeviceHelpers.subs as '.u64' or '.u32').
# NOTE(review): these symbols carry a `_p` suffix, while `setUp` looks up
# 'mwc_rng_mults' / 'mwc_rng_seeds' -- confirm which names are intended.
_prelude = """
|
||||
.const {{PTRT}} mwc_rng_mults_p;
|
||||
.const {{PTRT}} mwc_rng_seeds_p;
|
||||
"""
|
||||
|
||||
def __init__(self):
    """
    Check that `primes.bin` sits in the same directory as this module.

    Raises:
        EnvironmentError: if the file does not exist.
    """
    # The module path is `__file__`; `__FILE__` is not defined in Python and
    # raised NameError before the check could run.
    primes_path = os.path.join(os.path.dirname(__file__), 'primes.bin')
    if not os.path.isfile(primes_path):
        raise EnvironmentError('primes.bin not found')
|
||||
|
||||
def prelude(self, ctx=None):
    """
    Return the module-scope PTX template stored in `_prelude`.

    `ctx` is accepted and ignored so the signature agrees with
    `PTXFragment.prelude(self, ctx)`; it defaults to None so that existing
    zero-argument calls keep working.
    """
    return self._prelude
|
||||
|
||||
def setUp(self, ctx):
    """
    Load the multiplier table and upload multipliers and seeds to the device.

    Reads `primes.bin` (raw big-endian u32 values) from this module's
    directory, shuffles the first ``ctx.threads * 4`` entries with
    ``ctx.rand``, then copies the multipliers and freshly drawn random seed
    bytes into the module globals looked up below, on ``ctx.stream``.
    """
    # Read from the module directory, the location `__init__` checks, and in
    # binary mode so that no newline translation can corrupt the data.
    primes_path = os.path.join(os.path.dirname(__file__), 'primes.bin')
    with open(primes_path, 'rb') as primefp:
        raw = primefp.read()
    # Load raw big-endian u32 multipliers from primes.bin.
    dt = np.dtype(np.uint32).newbyteorder('B')
    # frombuffer yields a read-only, big-endian view; astype makes a writable
    # copy in host byte order, so the in-place shuffle works and the bytes
    # uploaded below are no longer byte-swapped on a little-endian host.
    mults = np.frombuffer(raw, dtype=dt).astype(np.uint32)
    # Randomness in choosing multipliers is good, but larger multipliers
    # have longer periods, which is also good. This is a compromise.
    ctx.rand.shuffle(mults[:ctx.threads*4])
    # Copy multipliers and seeds to the device
    # NOTE(review): `_prelude` declares `mwc_rng_mults_p`/`mwc_rng_seeds_p`,
    # not the names looked up here -- confirm which is intended.
    devmp, devml = ctx.mod.get_global('mwc_rng_mults')
    # tobytes() replaces tostring(), which was removed in NumPy 2.
    cuda.memcpy_htod_async(devmp, mults.tobytes()[:devml], ctx.stream)
    devsp, devsl = ctx.mod.get_global('mwc_rng_seeds')
    cuda.memcpy_htod_async(devsp, ctx.rand.bytes(devsl), ctx.stream)
|
||||
|
||||
def _next_b32(self, dreg):
|
||||
return """
|
||||
mul.wide.u32 mwc_rng_
|
||||
mul.wide.u32
|
||||
|
||||
|
||||
|
||||
def templates(self, ctx):
    """
    Return a dict mapping the template name 'mwc_next_b32' to the bound
    `_next_b32` method.
    """
    # A colon, not a comma: `{'k', v}` is a set literal, which loses the
    # association between the name and the method.
    return {'mwc_next_b32': self._next_b32}
|
||||
|
||||
|
||||
def test(self, ctx):
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def launch(self, ctx):
|
||||
if self.mults
|
||||
|
||||
|
||||
|
||||
def main(genome_path):
    """
    Program entry point; currently a no-op.

    `genome_path` is accepted but not read: the render-and-display code
    below is entirely commented out. This docstring also gives the function
    a body, which it needs in order to parse.
    """
    #with open(genome_path) as fp:
    #    genome = CUGenome.from_string(fp.read())[0]
    #genome.width, genome.height = 512, 512
    #genome.sample_density = 1000
    #obuf, stats, frame = genome.render(estimator=3)
    #gc.collect()

    ##q.put(str(obuf))
    ##p = Process(target=render, args=(q, genome_path))
    ##p.start()

    #window = pyglet.window.Window()
    #image = pyglet.image.ImageData(genome.width, genome.height, 'RGB', obuf)
    #tex = image.texture

    #@window.event
    #def on_draw():
    #    window.clear()
    #    tex.blit(0, 0)

    #pyglet.app.run()
|
||||
|
||||
if __name__ == "__main__":
    # The first argument must name an existing genome file.
    if len(sys.argv) < 2 or not os.path.isfile(sys.argv[1]):
        # Parenthesised form prints the same text on Python 2 and also
        # parses on Python 3, unlike the bare `print` statement.
        print("First argument must be a path to a genome file")
        sys.exit(1)
    main(sys.argv[1])
|
||||
|
Loading…
Reference in New Issue
Block a user