mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-03 18:00:55 -05:00 
			
		
		
		
	Allow swapping out store methods for easier testing of performance
This commit is contained in:
		@ -120,10 +120,7 @@ class IterThread(PTXEntryPoint):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        label('iter_loop_start')
 | 
					        label('iter_loop_start')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        timeout.check_time(10)
 | 
					        #timeout.check_time(10)
 | 
				
			||||||
 | 
					 | 
				
			||||||
        # TODO: diagram and fix syncing (can this be automated?)
 | 
					 | 
				
			||||||
        #op.bar.sync(1)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        with block():
 | 
					        with block():
 | 
				
			||||||
            reg.u32('num_rounds')
 | 
					            reg.u32('num_rounds')
 | 
				
			||||||
@ -453,13 +450,26 @@ class HistScatter(PTXFragment):
 | 
				
			|||||||
    def module_setup(self):
 | 
					    def module_setup(self):
 | 
				
			||||||
        mem.global_.f32('g_hist_bins',
 | 
					        mem.global_.f32('g_hist_bins',
 | 
				
			||||||
                        features.hist_height * features.hist_stride * 4)
 | 
					                        features.hist_height * features.hist_stride * 4)
 | 
				
			||||||
 | 
					        comment("Target to ensure fake local values get written")
 | 
				
			||||||
 | 
					        mem.global_.f32('g_hist_dummy')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @ptx_func
 | 
					    @ptx_func
 | 
				
			||||||
    def entry_setup(self):
 | 
					    def entry_setup(self):
 | 
				
			||||||
        comment("For now, assume histogram bins have been cleared by host")
 | 
					        comment("Fake bins for fake scatter")
 | 
				
			||||||
 | 
					        mem.local.f32('l_scatter_fake_adr')
 | 
				
			||||||
 | 
					        mem.local.f32('l_scatter_fake_alpha')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @ptx_func
 | 
					    @ptx_func
 | 
				
			||||||
    def scatter(self, x, y, color, xf_idx, p_valid=None):
 | 
					    def entry_teardown(self):
 | 
				
			||||||
 | 
					        with block("Store fake histogram bins to dummy global"):
 | 
				
			||||||
 | 
					            reg.b32('hist_dummy')
 | 
				
			||||||
 | 
					            op.ld.local.b32(hist_dummy, addr(l_scatter_fake_adr))
 | 
				
			||||||
 | 
					            op.st.volatile.b32(addr(g_hist_dummy), hist_dummy)
 | 
				
			||||||
 | 
					            op.ld.local.b32(hist_dummy, addr(l_scatter_fake_alpha))
 | 
				
			||||||
 | 
					            op.st.volatile.b32(addr(g_hist_dummy), hist_dummy)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @ptx_func
 | 
				
			||||||
 | 
					    def scatter(self, x, y, color, xf_idx, p_valid=None, type='ldst'):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        Scatter the given point directly to the histogram bins. I think this
 | 
					        Scatter the given point directly to the histogram bins. I think this
 | 
				
			||||||
        technique has the worst performance of all of 'em. Accesses ``cpA``
 | 
					        technique has the worst performance of all of 'em. Accesses ``cpA``
 | 
				
			||||||
@ -479,6 +489,7 @@ class HistScatter(PTXFragment):
 | 
				
			|||||||
            palette.look_up(r, g, b, a, color, norm_time)
 | 
					            palette.look_up(r, g, b, a, color, norm_time)
 | 
				
			||||||
            # TODO: look up, scale by xform visibility
 | 
					            # TODO: look up, scale by xform visibility
 | 
				
			||||||
            # TODO: Make this more performant
 | 
					            # TODO: Make this more performant
 | 
				
			||||||
 | 
					            if type == 'ldst':
 | 
				
			||||||
                reg.f32('gr gg gb ga')
 | 
					                reg.f32('gr gg gb ga')
 | 
				
			||||||
                op.ld.v4.f32(vec(gr, gg, gb, ga), addr(hist_bin_addr))
 | 
					                op.ld.v4.f32(vec(gr, gg, gb, ga), addr(hist_bin_addr))
 | 
				
			||||||
                op.add.f32(gr, gr, r)
 | 
					                op.add.f32(gr, gr, r)
 | 
				
			||||||
@ -486,7 +497,12 @@ class HistScatter(PTXFragment):
 | 
				
			|||||||
                op.add.f32(gb, gb, b)
 | 
					                op.add.f32(gb, gb, b)
 | 
				
			||||||
                op.add.f32(ga, ga, a)
 | 
					                op.add.f32(ga, ga, a)
 | 
				
			||||||
                op.st.v4.f32(addr(hist_bin_addr), vec(gr, gg, gb, ga))
 | 
					                op.st.v4.f32(addr(hist_bin_addr), vec(gr, gg, gb, ga))
 | 
				
			||||||
            #op.red.add.f32(addr(hist_bin_addr,4*i), val)
 | 
					            elif type == 'red':
 | 
				
			||||||
 | 
					                for i, val in enumerate([r, g, b, a]):
 | 
				
			||||||
 | 
					                    op.red.add.f32(addr(hist_bin_addr,4*i), val)
 | 
				
			||||||
 | 
					            elif type == 'fake':
 | 
				
			||||||
 | 
					                op.st.local.u32(addr(l_scatter_fake_adr), hist_bin_addr)
 | 
				
			||||||
 | 
					                op.st.local.f32(addr(l_scatter_fake_alpha), a)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def call_setup(self, ctx):
 | 
					    def call_setup(self, ctx):
 | 
				
			||||||
        hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins')
 | 
					        hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins')
 | 
				
			||||||
 | 
				
			|||||||
@ -154,7 +154,7 @@ class Animation(object):
 | 
				
			|||||||
        the active device.
 | 
					        the active device.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        # TODO: user-configurable test control
 | 
					        # TODO: user-configurable test control
 | 
				
			||||||
        self.ctx = LaunchContext([IterThread], block=(256,1,1), grid=(54,1),
 | 
					        self.ctx = LaunchContext([IterThread], block=(256,1,1), grid=(28,1),
 | 
				
			||||||
                                 tests=True)
 | 
					                                 tests=True)
 | 
				
			||||||
        # TODO: user-configurable verbosity control
 | 
					        # TODO: user-configurable verbosity control
 | 
				
			||||||
        self.ctx.compile(verbose=3, anim=self, features=self.features)
 | 
					        self.ctx.compile(verbose=3, anim=self, features=self.features)
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user