diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 0825dbf..acc6af9 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -351,7 +351,7 @@ void iter( *accbuf = pix; {{elif info.acc_mode == 'deferred'}} // 'color' gets the top 9 bits. TODO: add dithering via precalc. - uint32_t icolor = cc * 512.0f; + uint32_t icolor = fminf(1.0f, cc) * 511.0f; asm("bfi.b32 %0, %1, %0, 23, 9;" : "+r"(i) : "r"(icolor)); *log = i; {{endif}} @@ -377,8 +377,8 @@ __device__ void write_shmem_helper( float4 *acc, const int glo_idx, - const int dr, - const int gb + const uint32_t dr, + const uint32_t gb ) { float4 pix = acc[glo_idx]; pix.x += (dr & 0xffff) / 255.0f; @@ -461,7 +461,7 @@ write_shmem( bfe_decl(shr_addr, entry, 0, SHAB); bfe_decl(color, entry, 23, 9); - float colorf = color / 512.0f; + float colorf = color / 511.0f; float4 outcol = tex2D(palTex, colorf, time); // TODO: change texture sampler to return shorts and avoid this diff --git a/cuburn/render.py b/cuburn/render.py index 1834237..32aa68b 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -246,15 +246,14 @@ class Renderer(object): for i in range(nrounds): iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos, block=(32, self._iter.NTHREADS/32, 1), - grid=(ntemporal_samples, 1), - texrefs=[tref], stream=iter_stream) + grid=(ntemporal_samples, 1), stream=iter_stream) _sync_stream(write_stream, iter_stream) sorter.sort(d_log_sorted, d_log, log_size, start_bit, True, stream=write_stream) _sync_stream(iter_stream, write_stream) write_fun(d_accum, d_log_sorted, sorter.dglobal, log_shift, block=(1024, 1, 1), grid=(nwriteblocks, 1), - stream=write_stream) + texrefs=[tref], stream=write_stream) else: iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos, block=(32, self._iter.NTHREADS/32, 1),