diff options
Diffstat (limited to 'patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h')
-rw-r--r-- | patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h | 107 |
1 files changed, 107 insertions, 0 deletions
diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h new file mode 100644 index 0000000..86299cd --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h @@ -0,0 +1,107 @@ + { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + + floatxx *__restrict u_next = u + l + padZ; + floatxx *__restrict u_current = u + l; + floatxx *__restrict u_next_row = u + l + m; + + + floatxx *__restrict qx_current = qx + l; + floatxx *__restrict qy_current = qy + l; + floatxx *__restrict qx_prev = qx + l - padZ; + floatxx *__restrict qy_prev = qy + l - m; + + +// __assume(padZ%16==0); + +#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + floatyy u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads + udiff[k] = u[l + k] - u_upd; // cache 1w + u[l + k] = u_upd; // 1 write + +#ifdef TNV_LOOP_FIRST_J + udiff0[l + k] = udiff[k]; + div0[l + k] = div[l + k]; +#endif + +#ifdef TNV_LOOP_LAST_I + floatyy gradx_upd = 0; +#else + floatyy u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads + floatyy gradx_upd = (u_next_upd - u_upd); // 2 reads +#endif + +#ifdef TNV_LOOP_LAST_J + floatyy grady_upd = 0; +#else + floatyy u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads + floatyy grady_upd = (u_next_row_upd - u_upd); // 1 read +#endif + + gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w + gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w + gradx[l + k] = gradx_upd; // 1 write + grady[l + k] = grady_upd; // 1 write + + ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w + ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w + + floatyy vx = ubarx[k] + divsigma * qx[l + k]; // 1 read + floatyy vy = ubary[k] + divsigma * qy[l + k]; // 1 read + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < padZ; k++) { + floatyy vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r + floatyy vy = ubary[k] + divsigma * qy_current[k]; // cache 2r + floatyy gx_upd = vx * t[0] + vy * t[1]; + floatyy gy_upd = vx * t[1] + vy * t[2]; + + qxdiff = sigma * (ubarx[k] - gx_upd); + qydiff = sigma * (ubary[k] - gy_upd); + + qx_current[k] += qxdiff; // write 1 + qy_current[k] += qydiff; // write 1 + + unorm += (udiff[k] * udiff[k]); + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + floatyy div_upd = 0; + +#ifndef TNV_LOOP_FIRST_I + div_upd -= qx_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_FIRST_J + div_upd -= qy_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_LAST_I + div_upd += qx_current[k]; +#endif +#ifndef TNV_LOOP_LAST_J + div_upd += qy_current[k]; +#endif + + divdiff = div[l + k] - div_upd; // 1 read + div[l + k] = div_upd; // 1 write + +#ifndef TNV_LOOP_FIRST_J + resprimal += fabs(divtau * udiff[k] + divdiff); +#endif + + // We need to have two independent accumulators to allow gcc-autovectorization + resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r + resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r + product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + } + } +
\ No newline at end of file |