From e71a8422e5980dd1206f3fa81ea4dcd3ea2fed56 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Fri, 10 Sep 2010 18:45:32 -0400 Subject: [PATCH] Make store_per_thread reuse gtid in multiple calls when possible --- cuburn/device_code.py | 4 ++-- cuburn/ptx.py | 22 +++++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cuburn/device_code.py b/cuburn/device_code.py index b851582..dd399d4 100644 --- a/cuburn/device_code.py +++ b/cuburn/device_code.py @@ -132,8 +132,8 @@ class IterThread(PTXEntryPoint): label('all_cps_done') # TODO this is for testing, move it to a debug statement - std.store_per_thread(g_num_rounds, num_rounds) - std.store_per_thread(g_num_writes, num_writes) + std.store_per_thread(g_num_rounds, num_rounds, + g_num_writes, num_writes) @instmethod def upload_cp_stream(self, ctx, cp_stream, num_cps): diff --git a/cuburn/ptx.py b/cuburn/ptx.py index 6755f52..136eced 100644 --- a/cuburn/ptx.py +++ b/cuburn/ptx.py @@ -689,18 +689,22 @@ class _PTXStdLib(PTXFragment): op.mad.lo.u32(dst, cta, ncta, tid) @ptx_func - def store_per_thread(self, base, val): + def store_per_thread(self, *args): """Store b32 at `base+gtid*4`. 
Super-common debug pattern.""" - with block("Per-thread store of %s" % str(val)): + with block("Per-thread storing values"): reg.u32('spt_base spt_offset') - op.mov.u32(spt_base, base) self.get_gtid(spt_offset) - op.mad.lo.u32(spt_base, spt_offset, 4, spt_base) - if isinstance(val, float): - # Turn a constant float into the big-endian PTX binary float - # representation, 0fXXXXXXXX (where XX is hex byte) - val = '0f%x%x%x%x' % reversed(map(ord, struct.pack('f', val))) - op.st.b32(addr(spt_base), val) + op.mul.lo.u32(spt_offset, spt_offset, 4) + for i in range(0, len(args), 2): + base, val = args[i], args[i+1] + op.mov.u32(spt_base, base) + op.add.u32(spt_base, spt_base, spt_offset) + if isinstance(val, float): + # Turn a constant float into the big-endian PTX binary f32 + # representation, 0fXXXXXXXX (each XX a zero-padded hex byte) + val = '0f%02x%02x%02x%02x' % tuple(map(ord, + struct.pack('>f', val))) + op.st.b32(addr(spt_base), val) @ptx_func def set_is_first_thread(self, p_dst):