diff --git a/cuburn/render.py b/cuburn/render.py
index 8ffc5cc..e70c4de 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -200,16 +200,41 @@ class Renderer(object):
     # asynchronous, and avoid expensive CPU polling, this hangs on to
     # a number of (relatively small) CUDA modules and flushes them together.
     MAX_MODREFS = 20
-    _modrefs = []
+    # Loaded CUDA modules, keyed by the cubin they were loaded from.
+    _modrefs = {}
+
+    @classmethod
+    def compile(cls, gnm, arch=None):
+        """
+        Run ``iter.mkiterlib`` on ``gnm`` and compile the assembled code.
+
+        Returns a ``(packer, lib, cubin)`` tuple; ``arch`` is forwarded to
+        ``util.compile`` unchanged.
+        """
+        packer, lib = iter.mkiterlib(gnm)
+        cubin = util.compile('iter', assemble_code(lib), arch=arch)
+        return packer, lib, cubin
+
+    def load(self, cubin):
+        """
+        Return the CUDA module for ``cubin``, loading it on a cache miss.
+
+        Modules are cached in the class-wide ``_modrefs`` dict. Once the
+        cache holds more than ``MAX_MODREFS`` entries it is cleared in one
+        go before the new module is stored.
+        """
+        if cubin in self._modrefs:
+            return self._modrefs[cubin]
+        # Load the buffer that was passed in, not ``self.cubin``: the module
+        # is cached under ``cubin``, so the two must match.
+        mod = cuda.module_from_buffer(cubin)
+        if len(self._modrefs) > self.MAX_MODREFS:
+            self._modrefs.clear()
+        self._modrefs[cubin] = mod
+        return mod
 
     def __init__(self, gnm, gprof):
-        self.packer, self.lib = iter.mkiterlib(gnm)
-        cubin = util.compile('iter', assemble_code(self.lib))
-        self.mod = cuda.module_from_buffer(cubin)
-
-        if len(self._modrefs) > self.MAX_MODREFS:
-            del self._modrefs[:]
-        self._modrefs.append(self.mod)
-
+        self.packer, self.lib, self.cubin = self.compile(gnm)
+        self.mod = self.load(self.cubin)
         self.filts = filters.create(gprof)
         self.out = output.PILOutput()