diff --git a/cuburn/code/util.py b/cuburn/code/util.py
index b1e2dd0..a12bcea 100644
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@@ -96,7 +96,7 @@ def assemble_code(*libs):
 DEFAULT_CMP_OPTIONS = ('-use_fast_math', '-lineinfo')
 DEFAULT_SAVE_KERNEL = True
 def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL,
-            arch=None):
+            arch=None, keep=False):
     """
     Compile a module. Returns a copy of the source (for inspection or
     display) and the compiled cubin.
@@ -105,7 +105,9 @@ def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL,
     if save:
         with open(os.path.join(dir, name + '_kern.cu'), 'w') as fp:
             fp.write(src)
-    cubin = pycuda.compiler.compile(src, options=list(opts), arch=arch)
+    cubin = pycuda.compiler.compile(src, options=list(opts), arch=arch,
+                                    keep=keep)
     if save:
-        with open(os.path.join(dir, name + '_kern.cubin'), 'w') as fp:
+        # The cubin is a binary image, so write it in binary mode.
+        with open(os.path.join(dir, name + '_kern.cubin'), 'wb') as fp:
             fp.write(cubin)
diff --git a/cuburn/render.py b/cuburn/render.py
index 41fb2ed..7405fa2 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -218,9 +218,9 @@ class Renderer(object):
     _modrefs = {}
 
     @classmethod
-    def compile(cls, gnm, arch=None):
+    def compile(cls, gnm, arch=None, keep=False):  # arch/keep -> util.compile
         packer, lib = iter.mkiterlib(gnm)
-        cubin = util.compile('iter', assemble_code(lib), arch=arch)
+        cubin = util.compile('iter', assemble_code(lib), arch=arch, keep=keep)
         return packer, lib, cubin
 
     def load(self, cubin):
@@ -232,8 +232,8 @@ class Renderer(object):
         self._modrefs[cubin] = mod
         return mod
 
-    def __init__(self, gnm, gprof):
-        self.packer, self.lib, self.cubin = self.compile(gnm)
+    def __init__(self, gnm, gprof, keep=False, arch=None):
+        self.packer, self.lib, self.cubin = self.compile(gnm, keep=keep, arch=arch)
         self.mod = self.load(self.cubin)
         self.filts = filters.create(gprof)
         self.out = output.get_output_for_profile(gprof)
diff --git a/main.py b/main.py
index 6abfb65..a38830d 100755
--- a/main.py
+++ b/main.py
@@ -125,7 +125,10 @@ def main(args, prof):
 
     try:
       rmgr = render.RenderManager()
-      rdr = render.Renderer(gnm, gprof)
+      # Compile for the compute capability reported by `dev`, and hand the
+      # --keep flag down to the kernel compile step.
+      arch = 'sm_{}{}'.format(*dev.compute_capability())
+      rdr = render.Renderer(gnm, gprof, keep=args.keep, arch=arch)
 
       def render_iter():
           m = os.path.getmtime(args.flame)
@@ -191,6 +194,8 @@ if __name__ == "__main__":
         help="Print the blended animation and exit.")
     parser.add_argument('--device', metavar='NUM', type=int,
                         help="GPU device number to use (from nvidia-smi).")
+    parser.add_argument('--keep', action='store_true',
+                        help="Keep compiled kernels to help with profiling")
     profile.add_args(parser)
 
     args = parser.parse_args()