From da30ce619bce168fc24ed0c17f04e411626ba18b Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 29 Mar 2020 20:34:12 +0200 Subject: Eliminate conditionals in the innermost loop to help gcc auto-vectorizer --- src/Core/CMakeLists.txt | 13 +- src/Core/regularisers_CPU/TNV_core.c | 216 +++++++++++---------- src/Core/regularisers_CPU/TNV_core_backtrack.c | 189 +++++++++--------- .../regularisers_CPU/TNV_core_backtrack_loop.h | 100 ++++++++++ src/Core/regularisers_CPU/TNV_core_loop.h | 107 ++++++++++ 5 files changed, 423 insertions(+), 202 deletions(-) create mode 100644 src/Core/regularisers_CPU/TNV_core_backtrack_loop.h create mode 100644 src/Core/regularisers_CPU/TNV_core_loop.h diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 10b16f3..76b0f3e 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -60,9 +60,16 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") message("Adding regularisers as a shared library") #set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) -#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=2 -qopt-report-phase=vec -qopenmp") -set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") -#set(CMAKE_C_FLAGS "-march=native -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -mavx -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") + +#set(CMAKE_C_COMPILER clang) +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") + +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") #set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") #set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp") diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c index dce414a..415c644 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -5,12 +5,6 @@ * * Copyright 2017 Daniil Kazantsev * Copyright 2017 Srikanth Nagella, Edoardo Pasca - * - * Copyriht 2020 Suren A. Chlingaryan - * Optimized version with 1/3 of memory consumption and ~10x performance - * This version is not able to perform back-track except during first iterations - * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c) - * could be executed instead. It still better than original with 1/2 of memory consumption and 4x performance gain * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +17,7 @@ * limitations under the License. */ +#include #include "TNV_core.h" #define BLOCK 32 @@ -143,6 +138,7 @@ typedef struct { int offY, stepY, copY; float *Input, *u, *qx, *qy, *gradx, *grady, *div; float *div0, *udiff0; + float *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff; float resprimal, resdual; float unorm, qnorm, product; } tnv_thread_t; @@ -174,7 +170,13 @@ static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { free(ctx->div0); free(ctx->udiff0); - + + free(ctx->gradxdiff); + free(ctx->gradydiff); + free(ctx->ubarx); + free(ctx->ubary); + free(ctx->udiff); + return 0; } @@ -194,6 +196,7 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { long DimTotal = (long)(dimX*stepY*padZ); long Dim1Total = (long)(dimX*(stepY+1)*padZ); long DimRow = (long)(dimX * padZ); + long DimCell = (long)(padZ); // Auxiliar vectors ctx->Input = memalign(64, Dim1Total * sizeof(float)); @@ -207,7 +210,13 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { ctx->div0 = memalign(64, DimRow * sizeof(float)); ctx->udiff0 = memalign(64, DimRow * sizeof(float)); - if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) { + ctx->gradxdiff = memalign(64, DimCell * sizeof(float)); + ctx->gradydiff = memalign(64, DimCell * sizeof(float)); + ctx->ubarx = memalign(64, DimCell * sizeof(float)); + ctx->ubary = memalign(64, DimCell * sizeof(float)); + ctx->udiff = memalign(64, DimCell * sizeof(float)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { fprintf(stderr, "Error allocating memory\n"); exit(-1); } @@ -303,7 +312,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { - long i, j, k, l; + long i, j, k, l, m; tnv_context_t *tnv_ctx = (tnv_context_t*)data; tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -340,7 +349,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float constant = 1.0f + taulambda; float resprimal = 0.0f; - float resdual = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; float product = 0.0f; float unorm = 0.0f; float qnorm = 0.0f; @@ -348,100 +358,92 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float qxdiff; float qydiff; float divdiff; - float gradxdiff[dimZ] __attribute__((aligned(64))); - float gradydiff[dimZ] __attribute__((aligned(64))); - float ubarx[dimZ] __attribute__((aligned(64))); - float ubary[dimZ] __attribute__((aligned(64))); - float udiff[dimZ] __attribute__((aligned(64))); - - - for(i=0; i < dimX; i++) { - for(k = 0; k < dimZ; k++) { - int l = i * padZ + k; - float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; - float udiff = u[l] - u_upd; - ctx->udiff0[l] = udiff; - ctx->div0[l] = div[l]; + float *gradxdiff = ctx->gradxdiff; + float *gradydiff = ctx->gradydiff; + float *ubarx = ctx->ubarx; + float *ubary = ctx->ubary; + float *udiff = ctx->udiff; + + float *udiff0 = ctx->udiff0; + float *div0 = ctx->div0; + + + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J } - for(int j1 = 0; j1 < stepY; j1 += BLOCK) { - for(int i1 = 0; i1 < dimX; i1 += BLOCK) { - for(int j2 = 0; j2 < BLOCK; j2++) { - j = j1 + j2; - for(int i2 = 0; i2 < BLOCK; i2++) { - float t[3]; - float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; - - i = i1 + i2; - if (i == dimX) break; - if (j == stepY) { j2 = BLOCK; break; } - l = (j * dimX + i) * padZ; - -#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { - float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; - udiff[k] = u[l + k] - u_upd; - u[l + k] = u_upd; - - float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd); - float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd); - gradxdiff[k] = gradx[l + k] - gradx_upd; - gradydiff[k] = grady[l + k] - grady_upd; - gradx[l + k] = gradx_upd; - grady[l + k] = grady_upd; - - ubarx[k] = gradx_upd - theta * gradxdiff[k]; - ubary[k] = grady_upd - theta * gradydiff[k]; - - float vx = ubarx[k] + divsigma * qx[l + k]; - float vy = ubary[k] + divsigma * qy[l + k]; - - M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); - } - coefF(t, M1, M2, M3, sigma, p, q, r); - -#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { - float vx = ubarx[k] + divsigma * qx[l + k]; - float vy = ubary[k] + divsigma * qy[l + k]; - float gx_upd = vx * t[0] + vy * t[1]; - float gy_upd = vx * t[1] + vy * t[2]; - - qxdiff = sigma * (ubarx[k] - gx_upd); - qydiff = sigma * (ubary[k] - gy_upd); - - qx[l + k] += qxdiff; - qy[l + k] += qydiff; - - unorm += (udiff[k] * udiff[k]); - qnorm += (qxdiff * qxdiff + qydiff * qydiff); - - float div_upd = 0; - div_upd -= (i > 0)?qx[l + k - padZ]:0; - div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0; - div_upd += (i < (dimX-1))?qx[l + k]:0; - div_upd += (j < (copY-1))?qy[l + k]:0; - divdiff = div[l + k] - div_upd; - div[l + k] = div_upd; - - resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0; - resdual += fabs(divsigma * qxdiff + gradxdiff[k]); - resdual += fabs(divsigma * qydiff + gradydiff[k]); - product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_loop.h" + } } - } // i - } // j - } // i - } // j + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_LAST_J + } + ctx->resprimal = resprimal; - ctx->resdual = resdual; + ctx->resdual = resdual1 + resdual2; ctx->product = product; ctx->unorm = unorm; ctx->qnorm = qnorm; @@ -458,9 +460,9 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower -// tnv_ctx.padZ = dimZ; + tnv_ctx.padZ = dimZ; // tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); - tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); @@ -580,15 +582,28 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to float udiff = ctx->udiff0[l]; ctx->div[l] -= ctx0->qy[l + m]; - ctx0->div[m + l + dimX * padZ] = ctx->div[l]; - ctx0->u[m + l + dimX * padZ] = ctx->u[l]; - + ctx0->div[m + l + dimX*padZ] = ctx->div[l]; + ctx0->u[m + l + dimX*padZ] = ctx->u[l]; + divdiff += ctx0->qy[l + m]; resprimal += fabs(divtau * udiff + divdiff); } } } + { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div0[l] - ctx->div[l]; + float udiff = ctx->udiff0[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + for (j = 0; j < tnv_ctx.threads; j++) { tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; resprimal += ctx->resprimal; @@ -645,6 +660,5 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to printf("Iterations stopped at %i with the residual %f \n", iter, residual); printf("Return: %f\n", *uT); -// exit(-1); return *uT; } diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c index 7192475..9b19ed5 100755 --- a/src/Core/regularisers_CPU/TNV_core_backtrack.c +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c @@ -335,6 +335,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { long i, j, k, l; + long i1, i2, j1, j2; tnv_context_t *tnv_ctx = (tnv_context_t*)data; tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -376,11 +377,12 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float theta1 = 1.0f + theta; float constant = 1.0f + taulambda; - float resprimal = 0.0; - float resdual = 0.0; - float product = 0.0; - float unorm = 0.0; - float qnorm = 0.0; + float resprimal = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; float udiff[dimZ] __attribute__((aligned(64))); float qxdiff __attribute__((aligned(64))); @@ -389,104 +391,82 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float gradxdiff[dimZ] __attribute__((aligned(64))); float gradydiff[dimZ] __attribute__((aligned(64))); - for(int j1 = 0; j1 < stepY; j1 += BLOCK) { - for(int i1 = 0; i1 < dimX; i1 += BLOCK) { - for(int j2 = 0; j2 < BLOCK; j2++) { - j = j1 + j2; - for(int i2 = 0; i2 < BLOCK; i2++) { - float t[3]; - float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; - - i = i1 + i2; - if (i == dimX) break; - if (j == stepY) { j2 = BLOCK; break; } - l = (j * dimX + i) * padZ; - -//#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { - u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; - udiff[k] = u[l + k] - u_upd[l + k]; - unorm += (udiff[k] * udiff[k]); - - gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); - grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); - gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; - gradydiff[k] = grady[l + k] - grady_upd[l + k]; - - float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; - float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; -//#define TNV_NEW_STYLE -#ifdef TNV_NEW_STYLE - qx_upd[l + k] = qx[l + k] + sigma * ubarx; - qy_upd[l + k] = qy[l + k] + sigma * ubary; - - float vx = divsigma * qx_upd[l + k]; //+ ubarx - float vy = divsigma * qy_upd[l + k]; //+ ubary -#else - float vx = ubarx + divsigma * qx[l + k]; - float vy = ubary + divsigma * qy[l + k]; -#endif - - M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); - } - coefF(t, M1, M2, M3, sigma, p, q, r); - -//#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { -#ifdef TNV_NEW_STYLE - float vx = divsigma * qx_upd[l + k]; - float vy = divsigma * qy_upd[l + k]; - - float gx_upd = vx * t[0] + vy * t[1]; - float gy_upd = vx * t[1] + vy * t[2]; - - qx_upd[l + k] -= sigma * gx_upd; - qy_upd[l + k] -= sigma * gy_upd; -#else - float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; - float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; - float vx = ubarx + divsigma * qx[l + k]; - float vy = ubary + divsigma * qy[l + k]; - - float gx_upd = vx * t[0] + vy * t[1]; - float gy_upd = vx * t[1] + vy * t[2]; - - qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); - qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); -#endif - - float div_upd_val = 0; - div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0; - div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0; - div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0; - div_upd_val += (j < (copY-1))?qy_upd[l + k]:0; - div_upd[l + k] = div_upd_val; - - qxdiff = qx[l + k] - qx_upd[l + k]; - qydiff = qy[l + k] - qy_upd[l + k]; - qnorm += (qxdiff * qxdiff + qydiff * qydiff); - - resdual += fabs(divsigma * qxdiff - gradxdiff[k]); - resdual += fabs(divsigma * qydiff - gradydiff[k]); - product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); - - if ((offY == 0)||(j > 0)) { - divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? - resprimal += fabs(divtau * udiff[k] + divdiff); - } + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_backtrack_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J + } + + + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_backtrack_loop.h" + } } - } // i - } // j - } // i - } // j - + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_backtrack_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_LAST_J + } + ctx->resprimal = resprimal; - ctx->resdual = resdual; + ctx->resdual = resdual1 + resdual2; ctx->product = product; ctx->unorm = unorm; ctx->qnorm = qnorm; @@ -630,6 +610,19 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to } } + { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; + float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + for (j = 0; j < tnv_ctx.threads; j++) { tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; resprimal += ctx->resprimal; diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h new file mode 100644 index 0000000..3ec4250 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h @@ -0,0 +1,100 @@ + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + l = (j * dimX + i) * padZ; + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); + +#ifdef TNV_LOOP_LAST_I + gradx_upd[l + k] = 0; +#else + gradx_upd[l + k] = ((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); +#endif + +#ifdef TNV_LOOP_LAST_J + grady_upd[l + k] = 0; +#else + grady_upd[l + k] = ((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); +#endif + + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + float vx = divsigma * qx_upd[l + k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + + float div_upd_val = 0; +#ifndef TNV_LOOP_FIRST_I + div_upd_val -= qx_upd[l + k - padZ]; +#endif + +#ifndef TNV_LOOP_FIRST_J + div_upd_val -= qy_upd[l + k - dimX * padZ]; +#endif +#ifndef TNV_LOOP_LAST_I + div_upd_val += qx_upd[l + k]; +#endif +#ifndef TNV_LOOP_LAST_J + div_upd_val += qy_upd[l + k]; +#endif + div_upd[l + k] = div_upd_val; + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual1 += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual2 += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + +#ifndef TNV_LOOP_FIRST_J + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? + resprimal += fabs(divtau * udiff[k] + divdiff); +#endif + } diff --git a/src/Core/regularisers_CPU/TNV_core_loop.h b/src/Core/regularisers_CPU/TNV_core_loop.h new file mode 100644 index 0000000..34e7139 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_loop.h @@ -0,0 +1,107 @@ + { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + + float *__restrict u_next = u + l + padZ; + float *__restrict u_current = u + l; + float *__restrict u_next_row = u + l + m; + + + float *__restrict qx_current = qx + l; + float *__restrict qy_current = qy + l; + float *__restrict qx_prev = qx + l - padZ; + float *__restrict qy_prev = qy + l - m; + + +// __assume(padZ%16==0); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads + udiff[k] = u[l + k] - u_upd; // cache 1w + u[l + k] = u_upd; // 1 write + +#ifdef TNV_LOOP_FIRST_J + udiff0[l + k] = udiff[k]; + div0[l + k] = div[l + k]; +#endif + +#ifdef TNV_LOOP_LAST_I + float gradx_upd = 0; +#else + float u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads + float gradx_upd = (u_next_upd - u_upd); // 2 reads +#endif + +#ifdef TNV_LOOP_LAST_J + float grady_upd = 0; +#else + float u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads + float grady_upd = (u_next_row_upd - u_upd); // 1 read +#endif + + gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w + gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w + gradx[l + k] = gradx_upd; // 1 write + grady[l + k] = grady_upd; // 1 write + + ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w + ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w + + float vx = ubarx[k] + divsigma * qx[l + k]; // 1 read + float vy = ubary[k] + divsigma * qy[l + k]; // 1 read + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < padZ; k++) { + float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r + float vy = ubary[k] + divsigma * qy_current[k]; // cache 2r + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qxdiff = sigma * (ubarx[k] - gx_upd); + qydiff = sigma * (ubary[k] - gy_upd); + + qx_current[k] += qxdiff; // write 1 + qy_current[k] += qydiff; // write 1 + + unorm += (udiff[k] * udiff[k]); + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + float div_upd = 0; + +#ifndef TNV_LOOP_FIRST_I + div_upd -= qx_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_FIRST_J + div_upd -= qy_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_LAST_I + div_upd += qx_current[k]; +#endif +#ifndef TNV_LOOP_LAST_J + div_upd += qy_current[k]; +#endif + + divdiff = div[l + k] - div_upd; // 1 read + div[l + k] = div_upd; // 1 write + +#ifndef TNV_LOOP_FIRST_J + resprimal += fabs(divtau * udiff[k] + divdiff); +#endif + + // We need to have two independent accumulators to allow gcc-autovectorization + resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r + resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r + product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + } + } + \ No newline at end of file -- cgit v1.2.1