summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSuren A. Chilingaryan <csa@suren.me>2020-03-29 23:13:45 +0200
committerSuren A. Chilingaryan <csa@suren.me>2020-03-29 23:13:45 +0200
commitcd0dd76a48aada43c3066642a8eb327a0374d113 (patch)
treed5ce8cebda9511c5ac6280df54c29e8b631fa1cb
parentfebfe9a6490052d4b8789fd8f7a0342115bfd55e (diff)
downloadregularization-cd0dd76a48aada43c3066642a8eb327a0374d113.tar.gz
regularization-cd0dd76a48aada43c3066642a8eb327a0374d113.tar.bz2
regularization-cd0dd76a48aada43c3066642a8eb327a0374d113.tar.xz
regularization-cd0dd76a48aada43c3066642a8eb327a0374d113.zip
Add optimization steps for reference
-rw-r--r--src/Core/performance_CPU/README25
-rwxr-xr-xsrc/Core/performance_CPU/TNV_core.c.v15731
-rwxr-xr-xsrc/Core/performance_CPU/TNV_core.c.v17690
-rwxr-xr-xsrc/Core/performance_CPU/TNV_core.c.v18688
-rwxr-xr-xsrc/Core/performance_CPU/TNV_core.c.v19681
-rwxr-xr-xsrc/Core/performance_CPU/TNV_core.c.v27650
-rwxr-xr-xsrc/Core/performance_CPU/TNV_core.c.v32676
-rwxr-xr-xsrc/Core/performance_CPU/TNV_core.c.v4.stdver629
-rw-r--r--src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19100
-rw-r--r--src/Core/performance_CPU/TNV_core_loop.h.v32119
10 files changed, 4989 insertions, 0 deletions
diff --git a/src/Core/performance_CPU/README b/src/Core/performance_CPU/README
new file mode 100644
index 0000000..948c544
--- /dev/null
+++ b/src/Core/performance_CPU/README
@@ -0,0 +1,25 @@
+TNV_core.c.v4.stdver: Fully equivalent (signle-threads). There is two potential breakage points.
+ - If OpenMP is enabled, the acces to div_upd will not be serialized and results will breaj
+ - The results will slightly differ due to different order of summation if loop summing resprimal/resdual organized in a logical way
+TNV_core.c.v15: Multi-threads. Works correctly only in the single-threaded mode (if TNV_NEW_STYLE is disabled). In multi-threaded there results slightly differ due to changed order of operation
+ - TNV_NEW_STYLE slightly disturbs results in both single- and multi-threaded modes
+ - Resprimal/resdual are summed in groups (not sequentially) if multiple threads. But his actually should improve precision. Use TNV_CHECK_RES to check conformance
+ - Afterwards, in multi-threaded moded there is a still minor descripancy which first occurs in resprimal (after a few iterations). This is because of changed order of operations while computing
+ div_upd (only on the first lines of each new sub-block). Normally, we first compute the vertical and, then, add horizontal. On the border rows, instead we first add horizontals...
+ To check, div/div_upd changed to double. There is no difference then.
+TNV_core.c.v17: Computationaly comptabile with v15.
+ - Padding actually harms performance
+ - Intel compiler gives about 10% speed-up
+TNV_core.c.v18: Blocking helps to boost performance further but only with Intel Compiler. Gcc/Clang is slightly slower here.
+ - Padding here doesn't harm performance, but is not helpful either
+ - Difference between icc and gcc is probably due to auto-vectorization.
+ - Results slightly changed due to different order of operations
+TNV_core.c.v19: Eliminate conditionals in the inner loops to help gcc-autovectorisation
+ - Last version implementing full algrotithm with backtrack in the middle of iterations.
+ - Again results slightly diverge from v18 due to different order of operations
+TNV_core.c.v27: v18 with backtracking only on first iterations (otherwise warning reported)
+TNV_core.c.v32: v19 with backtracking only on first iterations (otherwise warning reported)
+
+
+Repo:
+ - Padding seems to have effect on the newer AVX2 systems. Re-enabled.
diff --git a/src/Core/performance_CPU/TNV_core.c.v15 b/src/Core/performance_CPU/TNV_core.c.v15
new file mode 100755
index 0000000..3161cbf
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v15
@@ -0,0 +1,731 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TNV_core.h"
+
+#define min(a,b) (((a)<(b))?(a):(b))
+
+/*inline*/ void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+ int ii, num;
+ float divsigma = 1.0f / sigma;
+ float sum, shrinkfactor;
+ float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+ float proj[2] = {0};
+
+ // Compute eigenvalues of M
+ T = M1 + M3;
+ D = M1 * M3 - M2 * M2;
+ det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f));
+ eig1 = MAX((T / 2.0f) + det, 0.0f);
+ eig2 = MAX((T / 2.0f) - det, 0.0f);
+ sig1 = sqrtf(eig1);
+ sig2 = sqrtf(eig2);
+
+ // Compute normalized eigenvectors
+ V1 = V2 = V3 = V4 = 0.0f;
+
+ if(M2 != 0.0f)
+ {
+ v0 = M2;
+ v1 = eig1 - M3;
+ v2 = eig2 - M3;
+
+ mu1 = sqrtf(v0 * v0 + v1 * v1);
+ mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+ if(mu1 > fTiny)
+ {
+ V1 = v1 / mu1;
+ V3 = v0 / mu1;
+ }
+
+ if(mu2 > fTiny)
+ {
+ V2 = v2 / mu2;
+ V4 = v0 / mu2;
+ }
+
+ } else
+ {
+ if(M1 > M3)
+ {
+ V1 = V4 = 1.0f;
+ V2 = V3 = 0.0f;
+
+ } else
+ {
+ V1 = V4 = 0.0f;
+ V2 = V3 = 1.0f;
+ }
+ }
+
+ // Compute prox_p of the diagonal entries
+ sig1_upd = sig2_upd = 0.0f;
+
+ if(p == 1)
+ {
+ sig1_upd = MAX(sig1 - divsigma, 0.0f);
+ sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+ } else if(p == INFNORM)
+ {
+ proj[0] = sigma * fabs(sig1);
+ proj[1] = sigma * fabs(sig2);
+
+ /*l1 projection part */
+ sum = fLarge;
+ num = 0l;
+ shrinkfactor = 0.0f;
+ while(sum > 1.0f)
+ {
+ sum = 0.0f;
+ num = 0;
+
+ for(ii = 0; ii < 2; ii++)
+ {
+ proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+ sum += fabs(proj[ii]);
+ if(proj[ii]!= 0.0f)
+ num++;
+ }
+
+ if(num > 0)
+ shrinkfactor = (sum - 1.0f) / num;
+ else
+ break;
+ }
+ /*l1 proj ends*/
+
+ sig1_upd = sig1 - divsigma * proj[0];
+ sig2_upd = sig2 - divsigma * proj[1];
+ }
+
+ // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+ if(sig1 > fTiny)
+ sig1_upd /= sig1;
+
+ if(sig2 > fTiny)
+ sig2_upd /= sig2;
+
+ // Compute solution
+ t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+ t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+ t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+#include "hw_sched.h"
+typedef struct {
+ int offY, stepY, realY, copY;
+ float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd;
+ double *div, *div_upd;
+ float resprimal, resdual;
+ float unorm, qnorm, product;
+} tnv_thread_t;
+
+typedef struct {
+ int threads;
+ tnv_thread_t *thr_ctx;
+ float *InputT, *uT;
+ int dimX, dimY, dimZ, padZ;
+ float lambda, sigma, tau, theta;
+} tnv_context_t;
+
+HWSched sched = NULL;
+tnv_context_t tnv_ctx;
+
+
+static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ free(ctx->Input);
+ free(ctx->u);
+ free(ctx->u_upd);
+ free(ctx->qx);
+ free(ctx->qy);
+ free(ctx->qx_upd);
+ free(ctx->qy_upd);
+ free(ctx->gradx);
+ free(ctx->grady);
+ free(ctx->gradx_upd);
+ free(ctx->grady_upd);
+ free(ctx->div);
+ free(ctx->div_upd);
+
+ return 0;
+}
+
+static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int realY = ctx->realY;
+
+// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*realY*dimZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+ // Auxiliar vectors
+ ctx->Input = calloc(Dim1Total, sizeof(float));
+ ctx->u = calloc(Dim1Total, sizeof(float));
+ ctx->u_upd = calloc(Dim1Total, sizeof(float));
+ ctx->qx = calloc(DimTotal, sizeof(float));
+ ctx->qy = calloc(DimTotal, sizeof(float));
+ ctx->qx_upd = calloc(DimTotal, sizeof(float));
+ ctx->qy_upd = calloc(DimTotal, sizeof(float));
+ ctx->gradx = calloc(DimTotal, sizeof(float));
+ ctx->grady = calloc(DimTotal, sizeof(float));
+ ctx->gradx_upd = calloc(DimTotal, sizeof(float));
+ ctx->grady_upd = calloc(DimTotal, sizeof(float));
+ ctx->div = calloc(Dim1Total, sizeof(double));
+ ctx->div_upd = calloc(Dim1Total, sizeof(double));
+
+ if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+ fprintf(stderr, "Error allocating memory\n");
+ exit(-1);
+ }
+
+ return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int realY = ctx->realY;
+ int copY = ctx->copY;
+
+// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*realY*dimZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(double));
+ memset(ctx->u_upd, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx_upd, 0, DimTotal * sizeof(float));
+ memset(ctx->qy_upd, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx_upd, 0, DimTotal * sizeof(float));
+ memset(ctx->grady_upd, 0, DimTotal * sizeof(float));
+ memset(ctx->div_upd, 0, Dim1Total * sizeof(double));
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<copY; j++) {
+ for(i=0; i<dimX; i++) {
+ ctx->Input[j * dimX * dimZ + i * dimZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+ ctx->u[j * dimX * dimZ + i * dimZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int realY = ctx->realY;
+ int copY = ctx->copY;
+
+ long DimTotal = (long)(dimX*realY*dimZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<copY; j++) {
+ for(i=0; i<dimX; i++) {
+ tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * dimZ + i * dimZ + k];
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int realY = ctx->realY;
+ long DimTotal = (long)(dimX*realY*dimZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+ // Auxiliar vectors
+ memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+ memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float));
+ memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float));
+ memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float));
+ memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float));
+ memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(double));
+
+ return 0;
+}
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int realY = ctx->realY;
+ long DimTotal = (long)(dimX*realY*dimZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+ // Auxiliar vectors
+ memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float));
+ memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float));
+ memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float));
+ memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float));
+ memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float));
+ memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(double));
+
+ return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+ long i, j, k, l, m;
+
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ float *Input = ctx->Input;
+ float *u = ctx->u;
+ float *u_upd = ctx->u_upd;
+ float *qx = ctx->qx;
+ float *qy = ctx->qy;
+ float *qx_upd = ctx->qx_upd;
+ float *qy_upd = ctx->qy_upd;
+ float *gradx = ctx->gradx;
+ float *grady = ctx->grady;
+ float *gradx_upd = ctx->gradx_upd;
+ float *grady_upd = ctx->grady_upd;
+ double *div = ctx->div;
+ double *div_upd = ctx->div_upd;
+
+ long p = 1l;
+ long q = 1l;
+ long r = 0l;
+
+ float lambda = tnv_ctx->lambda;
+ float sigma = tnv_ctx->sigma;
+ float tau = tnv_ctx->tau;
+ float theta = tnv_ctx->theta;
+
+ float taulambda = tau * lambda;
+ float divtau = 1.0f / tau;
+ float divsigma = 1.0f / sigma;
+ float theta1 = 1.0f + theta;
+ float constant = 1.0f + taulambda;
+
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float udiff[dimZ];
+ float qxdiff;
+ float qydiff;
+ float divdiff;
+ float gradxdiff[dimZ];
+ float gradydiff[dimZ];
+
+ for(k=0; k < dimZ * dimX; k++) {
+ u_upd[k] = (u[k] + tau * div[k] + taulambda * Input[k])/constant;
+ div_upd[k] = 0;
+ }
+
+ for(j = 0; j < stepY; j++) {
+/* m = j * dimX * dimZ + (dimX - 1) * dimZ;
+ for(k = 0; k < dimZ; k++) {
+ u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant;
+ }*/
+
+ for(i=0; i < (dimX /*- 1*/); i++) {
+ l = (j * dimX + i) * dimZ;
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+ m = dimX * dimZ;
+
+//#pragma unroll 64
+ for(k = 0; k < dimZ; k++) {
+ u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant;
+
+ gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + dimZ] - u_upd[l + k]);
+ grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + dimX * dimZ] - u_upd[l + k]); // We need div from the next thread on last iter
+
+ udiff[k] = u[l + k] - u_upd[l + k];
+ unorm += (udiff[k] * udiff[k]);
+// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm);
+
+ gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
+ gradydiff[k] = grady[l + k] - grady_upd[l + k];
+
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+//#define TNV_NEW_STYLE
+#ifdef TNV_NEW_STYLE
+ qx_upd[l + k] = qx[l + k] + sigma * ubarx;
+ qy_upd[l + k] = qy[l + k] + sigma * ubary;
+
+ float vx = divsigma * qx_upd[l + k]; //+ ubarx
+ float vy = divsigma * qy_upd[l + k]; //+ ubary
+#else
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+#endif
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+//#pragma unroll 64
+ for(k = 0; k < dimZ; k++) {
+#ifdef TNV_NEW_STYLE
+ float vx = divsigma * qx_upd[l + k];
+ float vy = divsigma * qy_upd[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] -= sigma * gx_upd;
+ qy_upd[l + k] -= sigma * gy_upd;
+#else
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
+ qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
+#endif
+
+if(i != (dimX-1)) {
+ div_upd[l + k] += qx_upd[l + k];
+ div_upd[l + k + dimZ] -= qx_upd[l + k];
+}
+if(j != (copY-1)) {
+ div_upd[l + k] += qy_upd[l + k];
+ div_upd[l + k + dimX * dimZ] = -qy_upd[l + k]; // We need to update div in the next thread on last iter
+}
+
+ qxdiff = qx[l + k] - qx_upd[l + k];
+ qydiff = qy[l + k] - qy_upd[l + k];
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+ resdual += fabs(divsigma * qxdiff - gradxdiff[k]);
+ resdual += fabs(divsigma * qydiff - gradydiff[k]);
+ product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+ if ((offY == 0)||(j > 0)) {
+ divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history?
+ resprimal += fabs(divtau * udiff[k] + divdiff);
+ }
+ }
+
+ } // i
+ }
+
+
+ ctx->resprimal = resprimal;
+ ctx->resdual = resdual;
+ ctx->product = product;
+ ctx->unorm = unorm;
+ ctx->qnorm = qnorm;
+
+ return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+ int i, off, size, err;
+
+ if (sched) return;
+
+ tnv_ctx.dimX = dimX;
+ tnv_ctx.dimY = dimY;
+ tnv_ctx.dimZ = dimZ;
+ tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0));
+
+ hw_sched_init();
+
+ int threads = hw_sched_get_cpu_count();
+ if (threads > dimY) threads = dimY/2;
+
+ int step = dimY / threads;
+ int extra = dimY % threads;
+
+ tnv_ctx.threads = threads;
+ tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+ for (i = 0, off = 0; i < threads; i++, off += size) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+ size = step + ((i < extra)?1:0);
+
+ ctx->offY = off;
+ ctx->stepY = size;
+ ctx->realY = ctx->stepY;
+
+ if (i == (threads-1)) ctx->copY = ctx->stepY;
+ else ctx->copY = ctx->stepY + 1;
+ }
+
+ sched = hw_sched_create(threads);
+ if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see
+ * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. eplsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+ int err;
+ int iter;
+ int i,j,k,l;
+
+ lambda = 1.0f/(2.0f*lambda);
+ tnv_ctx.lambda = lambda;
+
+ // PDHG algorithm parameters
+ float tau = 0.5f;
+ float sigma = 0.5f;
+ float theta = 1.0f;
+
+ // Backtracking parameters
+ float s = 1.0f;
+ float gamma = 0.75f;
+ float beta = 0.95f;
+ float alpha0 = 0.2f;
+ float alpha = alpha0;
+ float delta = 1.5f;
+ float eta = 0.95f;
+
+ TNV_CPU_init(InputT, uT, dimX, dimY, dimZ);
+
+ tnv_ctx.InputT = InputT;
+ tnv_ctx.uT = uT;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); }
+
+
+ // Apply Primal-Dual Hybrid Gradient scheme
+ float residual = fLarge;
+ for(iter = 0; iter < maxIter; iter++) {
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float divtau = 1.0f / tau;
+
+ tnv_ctx.sigma = sigma;
+ tnv_ctx.tau = tau;
+ tnv_ctx.theta = theta;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); }
+
+ // border regions
+ for (i = 1; i < tnv_ctx.threads; i++) {
+ tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (i - 1);
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+
+ l = ctx0->stepY * dimX * dimZ;
+ for(k=0; k < dimZ * (dimX /*- 1*/); k++) {
+ double div_upd_add = ctx0->div_upd[l + k];
+ ctx->div_upd[k] += div_upd_add;
+ ctx0->div_upd[l + k] = ctx->div_upd[k];
+
+// ctx0->u_upd[l + k] = ctx->u_upd[k];
+
+ float divdiff = ctx->div[k] - ctx->div_upd[k]; // Multiple steps... How we compute without history?
+ float udiff = ctx->u[k] - ctx->u_upd[k];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+
+#define TNV_CHECK_RES
+#ifndef TNV_CHECK_RES
+ for (i = 0; i < tnv_ctx.threads; i++) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+ resprimal += ctx->resprimal;
+ resdual += ctx->resdual;
+ product += ctx->product;
+ unorm += ctx->unorm;
+ qnorm += ctx->qnorm;
+ }
+#else
+ resprimal = 0;
+ float divsigma = 1.0f / sigma;
+ for(j=0; j<dimY; j++)
+ for(i=0; i<dimX; i++)
+ for(int l=0; l<dimZ; l++)
+ {
+ int step = dimY / tnv_ctx.threads;
+ int extra = dimY % tnv_ctx.threads;
+
+ int thr, subj = j;
+ for (thr = 0; thr < tnv_ctx.threads; thr++) {
+ int size = step;
+ if (thr < extra) size++;
+
+ if (subj >= size) subj-= size;
+ else break;
+ }
+
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + thr;
+
+ int k = subj * dimX * dimZ + i * dimZ + l;
+
+ float udiff = ctx->u[k] - ctx->u_upd[k];
+ float qxdiff = ctx->qx[k] - ctx->qx_upd[k];
+ float qydiff = ctx->qy[k] - ctx->qy_upd[k];
+ float divdiff = ctx->div[k] - ctx->div_upd[k];
+ float gradxdiff = ctx->gradx[k] - ctx->gradx_upd[k];
+ float gradydiff = ctx->grady[k] - ctx->grady_upd[k];
+
+ resprimal += fabs(divtau*udiff + divdiff);
+ resdual += fabs(divsigma*qxdiff - gradxdiff);
+ resdual += fabs(divsigma*qydiff - gradydiff);
+
+ unorm += (udiff * udiff);
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+ product += (gradxdiff * qxdiff + gradydiff * qydiff);
+ }
+#endif
+
+
+
+ residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+ float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+ float dual_dot_delta = resdual * s * delta;
+ float dual_div_delta = (resdual * s) / delta;
+ printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+ if(b > 1) {
+ // Decrease step-sizes to fit balancing principle
+ tau = (beta * tau) / b;
+ sigma = (beta * sigma) / b;
+ alpha = alpha0;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+ } else {
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+ if(resprimal > dual_dot_delta) {
+ // Increase primal step-size and decrease dual step-size
+ tau = tau / (1.0f - alpha);
+ sigma = sigma * (1.0f - alpha);
+ alpha = alpha * eta;
+ } else if(resprimal < dual_div_delta) {
+ // Decrease primal step-size and increase dual step-size
+ tau = tau * (1.0f - alpha);
+ sigma = sigma / (1.0f - alpha);
+ alpha = alpha * eta;
+ }
+ }
+
+ if (residual < tol) break;
+ }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+ printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+ printf("Return: %f\n", *uT);
+
+ return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v17 b/src/Core/performance_CPU/TNV_core.c.v17
new file mode 100755
index 0000000..60739cd
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v17
@@ -0,0 +1,690 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TNV_core.h"
+
+#define min(a,b) (((a)<(b))?(a):(b))
+
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+ int ii, num;
+ float divsigma = 1.0f / sigma;
+ float sum, shrinkfactor;
+ float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+ float proj[2] = {0};
+
+ // Compute eigenvalues of M
+ T = M1 + M3;
+ D = M1 * M3 - M2 * M2;
+ det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f));
+ eig1 = MAX((T / 2.0f) + det, 0.0f);
+ eig2 = MAX((T / 2.0f) - det, 0.0f);
+ sig1 = sqrtf(eig1);
+ sig2 = sqrtf(eig2);
+
+ // Compute normalized eigenvectors
+ V1 = V2 = V3 = V4 = 0.0f;
+
+ if(M2 != 0.0f)
+ {
+ v0 = M2;
+ v1 = eig1 - M3;
+ v2 = eig2 - M3;
+
+ mu1 = sqrtf(v0 * v0 + v1 * v1);
+ mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+ if(mu1 > fTiny)
+ {
+ V1 = v1 / mu1;
+ V3 = v0 / mu1;
+ }
+
+ if(mu2 > fTiny)
+ {
+ V2 = v2 / mu2;
+ V4 = v0 / mu2;
+ }
+
+ } else
+ {
+ if(M1 > M3)
+ {
+ V1 = V4 = 1.0f;
+ V2 = V3 = 0.0f;
+
+ } else
+ {
+ V1 = V4 = 0.0f;
+ V2 = V3 = 1.0f;
+ }
+ }
+
+ // Compute prox_p of the diagonal entries
+ sig1_upd = sig2_upd = 0.0f;
+
+ if(p == 1)
+ {
+ sig1_upd = MAX(sig1 - divsigma, 0.0f);
+ sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+ } else if(p == INFNORM)
+ {
+ proj[0] = sigma * fabs(sig1);
+ proj[1] = sigma * fabs(sig2);
+
+ /*l1 projection part */
+ sum = fLarge;
+ num = 0l;
+ shrinkfactor = 0.0f;
+ while(sum > 1.0f)
+ {
+ sum = 0.0f;
+ num = 0;
+
+ for(ii = 0; ii < 2; ii++)
+ {
+ proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+ sum += fabs(proj[ii]);
+ if(proj[ii]!= 0.0f)
+ num++;
+ }
+
+ if(num > 0)
+ shrinkfactor = (sum - 1.0f) / num;
+ else
+ break;
+ }
+ /*l1 proj ends*/
+
+ sig1_upd = sig1 - divsigma * proj[0];
+ sig2_upd = sig2 - divsigma * proj[1];
+ }
+
+ // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+ if(sig1 > fTiny)
+ sig1_upd /= sig1;
+
+ if(sig2 > fTiny)
+ sig2_upd /= sig2;
+
+ // Compute solution
+ t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+ t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+ t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+#include "hw_sched.h"
+typedef struct {
+ int offY, stepY, copY;
+ float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd;
+ float *div, *div_upd;
+ float resprimal, resdual;
+ float unorm, qnorm, product;
+} tnv_thread_t;
+
+typedef struct {
+ int threads;
+ tnv_thread_t *thr_ctx;
+ float *InputT, *uT;
+ int dimX, dimY, dimZ, padZ;
+ float lambda, sigma, tau, theta;
+} tnv_context_t;
+
+HWSched sched = NULL;
+tnv_context_t tnv_ctx;
+
+
+static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ free(ctx->Input);
+ free(ctx->u);
+ free(ctx->u_upd);
+ free(ctx->qx);
+ free(ctx->qy);
+ free(ctx->qx_upd);
+ free(ctx->qy_upd);
+ free(ctx->gradx);
+ free(ctx->grady);
+ free(ctx->gradx_upd);
+ free(ctx->grady_upd);
+ free(ctx->div);
+ free(ctx->div_upd);
+
+ return 0;
+}
+
+static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+
+// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+
+ // Auxiliar vectors
+ ctx->Input = malloc(Dim1Total * sizeof(float));
+ ctx->u = malloc(Dim1Total * sizeof(float));
+ ctx->u_upd = malloc(Dim1Total * sizeof(float));
+ ctx->qx = malloc(DimTotal * sizeof(float));
+ ctx->qy = malloc(DimTotal * sizeof(float));
+ ctx->qx_upd = malloc(DimTotal * sizeof(float));
+ ctx->qy_upd = malloc(DimTotal * sizeof(float));
+ ctx->gradx = malloc(DimTotal * sizeof(float));
+ ctx->grady = malloc(DimTotal * sizeof(float));
+ ctx->gradx_upd = malloc(DimTotal * sizeof(float));
+ ctx->grady_upd = malloc(DimTotal * sizeof(float));
+ ctx->div = malloc(Dim1Total * sizeof(float));
+ ctx->div_upd = malloc(Dim1Total * sizeof(float));
+
+ if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+ fprintf(stderr, "Error allocating memory\n");
+ exit(-1);
+ }
+
+ return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<copY; j++) {
+ for(i=0; i<dimX; i++) {
+ ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+ ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<stepY; j++) {
+ for(i=0; i<dimX; i++) {
+ tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ // Auxiliar vectors
+ memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+ memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float));
+ memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float));
+ memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float));
+ memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float));
+ memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ // Auxiliar vectors
+ memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float));
+ memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float));
+ memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float));
+ memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float));
+ memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float));
+ memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+ long i, j, k, l, m;
+
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ float *Input = ctx->Input;
+ float *u = ctx->u;
+ float *u_upd = ctx->u_upd;
+ float *qx = ctx->qx;
+ float *qy = ctx->qy;
+ float *qx_upd = ctx->qx_upd;
+ float *qy_upd = ctx->qy_upd;
+ float *gradx = ctx->gradx;
+ float *grady = ctx->grady;
+ float *gradx_upd = ctx->gradx_upd;
+ float *grady_upd = ctx->grady_upd;
+ float *div = ctx->div;
+ float *div_upd = ctx->div_upd;
+
+ long p = 1l;
+ long q = 1l;
+ long r = 0l;
+
+ float lambda = tnv_ctx->lambda;
+ float sigma = tnv_ctx->sigma;
+ float tau = tnv_ctx->tau;
+ float theta = tnv_ctx->theta;
+
+ float taulambda = tau * lambda;
+ float divtau = 1.0f / tau;
+ float divsigma = 1.0f / sigma;
+ float theta1 = 1.0f + theta;
+ float constant = 1.0f + taulambda;
+
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float udiff[dimZ];
+ float qxdiff;
+ float qydiff;
+ float divdiff;
+ float gradxdiff[dimZ];
+ float gradydiff[dimZ];
+
+ for(i=0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+ u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant;
+ div_upd[l] = 0;
+ }
+ }
+
+ for(j = 0; j < stepY; j++) {
+/* m = j * dimX * dimZ + (dimX - 1) * dimZ;
+ for(k = 0; k < dimZ; k++) {
+ u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant;
+ }*/
+
+ for(i = 0; i < dimX/* - 1*/; i++) {
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+ l = (j * dimX + i) * padZ;
+ m = dimX * padZ;
+
+//#pragma unroll 64
+ for(k = 0; k < dimZ; k++) {
+ u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant;
+
+ gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]);
+ grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]); // We need div from the next thread on last iter
+
+ udiff[k] = u[l + k] - u_upd[l + k];
+ unorm += (udiff[k] * udiff[k]);
+// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm);
+
+ gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
+ gradydiff[k] = grady[l + k] - grady_upd[l + k];
+
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+//#define TNV_NEW_STYLE
+#ifdef TNV_NEW_STYLE
+ qx_upd[l + k] = qx[l + k] + sigma * ubarx;
+ qy_upd[l + k] = qy[l + k] + sigma * ubary;
+
+ float vx = divsigma * qx_upd[l + k]; //+ ubarx
+ float vy = divsigma * qy_upd[l + k]; //+ ubary
+#else
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+#endif
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+//#pragma unroll 64
+ for(k = 0; k < dimZ; k++) {
+#ifdef TNV_NEW_STYLE
+ float vx = divsigma * qx_upd[l + k];
+ float vy = divsigma * qy_upd[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] -= sigma * gx_upd;
+ qy_upd[l + k] -= sigma * gy_upd;
+#else
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
+ qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
+#endif
+
+if(i != (dimX-1)) {
+ div_upd[l + k] += qx_upd[l + k];
+ div_upd[l + k + padZ] -= qx_upd[l + k];
+}
+if(j != (copY-1)) {
+ div_upd[l + k] += qy_upd[l + k];
+ div_upd[l + k + m] = -qy_upd[l + k]; // We need to update div in the next thread on last iter
+}
+
+ qxdiff = qx[l + k] - qx_upd[l + k];
+ qydiff = qy[l + k] - qy_upd[l + k];
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+ resdual += fabs(divsigma * qxdiff - gradxdiff[k]);
+ resdual += fabs(divsigma * qydiff - gradydiff[k]);
+ product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+ if ((offY == 0)||(j > 0)) {
+ divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history?
+ resprimal += fabs(divtau * udiff[k] + divdiff);
+ }
+ }
+
+ } // i
+ }
+
+
+ ctx->resprimal = resprimal;
+ ctx->resdual = resdual;
+ ctx->product = product;
+ ctx->unorm = unorm;
+ ctx->qnorm = qnorm;
+
+ return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+ int i, off, size, err;
+
+ if (sched) return;
+
+ tnv_ctx.dimX = dimX;
+ tnv_ctx.dimY = dimY;
+ tnv_ctx.dimZ = dimZ;
+ // Padding seems actually slower
+// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0));
+ tnv_ctx.padZ = dimZ;
+
+ hw_sched_init();
+
+ int threads = hw_sched_get_cpu_count();
+ if (threads > dimY) threads = dimY/2;
+
+ int step = dimY / threads;
+ int extra = dimY % threads;
+
+ tnv_ctx.threads = threads;
+ tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+ for (i = 0, off = 0; i < threads; i++, off += size) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+ size = step + ((i < extra)?1:0);
+
+ ctx->offY = off;
+ ctx->stepY = size;
+
+ if (i == (threads-1)) ctx->copY = ctx->stepY;
+ else ctx->copY = ctx->stepY + 1;
+ }
+
+ sched = hw_sched_create(threads);
+ if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see
+ * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. eplsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+ int err;
+ int iter;
+ int i,j,k,l,m;
+
+ lambda = 1.0f/(2.0f*lambda);
+ tnv_ctx.lambda = lambda;
+
+ // PDHG algorithm parameters
+ float tau = 0.5f;
+ float sigma = 0.5f;
+ float theta = 1.0f;
+
+ // Backtracking parameters
+ float s = 1.0f;
+ float gamma = 0.75f;
+ float beta = 0.95f;
+ float alpha0 = 0.2f;
+ float alpha = alpha0;
+ float delta = 1.5f;
+ float eta = 0.95f;
+
+ TNV_CPU_init(InputT, uT, dimX, dimY, dimZ);
+
+ tnv_ctx.InputT = InputT;
+ tnv_ctx.uT = uT;
+
+ int padZ = tnv_ctx.padZ;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); }
+
+
+ // Apply Primal-Dual Hybrid Gradient scheme
+ float residual = fLarge;
+ for(iter = 0; iter < maxIter; iter++) {
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float divtau = 1.0f / tau;
+
+ tnv_ctx.sigma = sigma;
+ tnv_ctx.tau = tau;
+ tnv_ctx.theta = theta;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); }
+
+ // border regions
+ for (j = 1; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1);
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+
+ m = ctx0->stepY * dimX * padZ;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ float div_upd_add = ctx0->div_upd[m + l];
+ ctx->div_upd[l] += div_upd_add;
+ ctx0->div_upd[m + l] = ctx->div_upd[l];
+ //ctx0->u_upd[m + l] = ctx->u_upd[l];
+
+ float divdiff = ctx->div[l] - ctx->div_upd[l]; // Multiple steps... How we compute without history?
+ float udiff = ctx->u[l] - ctx->u_upd[l];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
+
+ for (j = 0; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+ resprimal += ctx->resprimal;
+ resdual += ctx->resdual;
+ product += ctx->product;
+ unorm += ctx->unorm;
+ qnorm += ctx->qnorm;
+ }
+
+ residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+ float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+ float dual_dot_delta = resdual * s * delta;
+ float dual_div_delta = (resdual * s) / delta;
+ printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+ if(b > 1) {
+ // Decrease step-sizes to fit balancing principle
+ tau = (beta * tau) / b;
+ sigma = (beta * sigma) / b;
+ alpha = alpha0;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+ } else {
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+ if(resprimal > dual_dot_delta) {
+ // Increase primal step-size and decrease dual step-size
+ tau = tau / (1.0f - alpha);
+ sigma = sigma * (1.0f - alpha);
+ alpha = alpha * eta;
+ } else if(resprimal < dual_div_delta) {
+ // Decrease primal step-size and increase dual step-size
+ tau = tau * (1.0f - alpha);
+ sigma = sigma / (1.0f - alpha);
+ alpha = alpha * eta;
+ }
+ }
+
+ if (residual < tol) break;
+ }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+ printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+ printf("Return: %f\n", *uT);
+
+ return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v18 b/src/Core/performance_CPU/TNV_core.c.v18
new file mode 100755
index 0000000..7192475
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v18
@@ -0,0 +1,688 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyriht 2020 Suren A. Chlingaryan
+ * Optimized version with 1/2 of memory consumption and ~4x performance
+ * This version is algorithmicly comptable with the original, but slight change in results
+ * is expected due to different order of floating-point operations.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <malloc.h>
+#include "TNV_core.h"
+
+#define BLOCK 32
+#define min(a,b) (((a)<(b))?(a):(b))
+
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+ int ii, num;
+ float divsigma = 1.0f / sigma;
+ float sum, shrinkfactor;
+ float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+ float proj[2] = {0};
+
+ // Compute eigenvalues of M
+ T = M1 + M3;
+ D = M1 * M3 - M2 * M2;
+ det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f));
+ eig1 = MAX((T / 2.0f) + det, 0.0f);
+ eig2 = MAX((T / 2.0f) - det, 0.0f);
+ sig1 = sqrtf(eig1);
+ sig2 = sqrtf(eig2);
+
+ // Compute normalized eigenvectors
+ V1 = V2 = V3 = V4 = 0.0f;
+
+ if(M2 != 0.0f)
+ {
+ v0 = M2;
+ v1 = eig1 - M3;
+ v2 = eig2 - M3;
+
+ mu1 = sqrtf(v0 * v0 + v1 * v1);
+ mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+ if(mu1 > fTiny)
+ {
+ V1 = v1 / mu1;
+ V3 = v0 / mu1;
+ }
+
+ if(mu2 > fTiny)
+ {
+ V2 = v2 / mu2;
+ V4 = v0 / mu2;
+ }
+
+ } else
+ {
+ if(M1 > M3)
+ {
+ V1 = V4 = 1.0f;
+ V2 = V3 = 0.0f;
+
+ } else
+ {
+ V1 = V4 = 0.0f;
+ V2 = V3 = 1.0f;
+ }
+ }
+
+ // Compute prox_p of the diagonal entries
+ sig1_upd = sig2_upd = 0.0f;
+
+ if(p == 1)
+ {
+ sig1_upd = MAX(sig1 - divsigma, 0.0f);
+ sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+ } else if(p == INFNORM)
+ {
+ proj[0] = sigma * fabs(sig1);
+ proj[1] = sigma * fabs(sig2);
+
+ /*l1 projection part */
+ sum = fLarge;
+ num = 0l;
+ shrinkfactor = 0.0f;
+ while(sum > 1.0f)
+ {
+ sum = 0.0f;
+ num = 0;
+
+ for(ii = 0; ii < 2; ii++)
+ {
+ proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+ sum += fabs(proj[ii]);
+ if(proj[ii]!= 0.0f)
+ num++;
+ }
+
+ if(num > 0)
+ shrinkfactor = (sum - 1.0f) / num;
+ else
+ break;
+ }
+ /*l1 proj ends*/
+
+ sig1_upd = sig1 - divsigma * proj[0];
+ sig2_upd = sig2 - divsigma * proj[1];
+ }
+
+ // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+ if(sig1 > fTiny)
+ sig1_upd /= sig1;
+
+ if(sig2 > fTiny)
+ sig2_upd /= sig2;
+
+ // Compute solution
+ t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+ t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+ t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+#include "hw_sched.h"
+typedef struct {
+ int offY, stepY, copY;
+ float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd;
+ float *div, *div_upd;
+ float resprimal, resdual;
+ float unorm, qnorm, product;
+} tnv_thread_t;
+
+typedef struct {
+ int threads;
+ tnv_thread_t *thr_ctx;
+ float *InputT, *uT;
+ int dimX, dimY, dimZ, padZ;
+ float lambda, sigma, tau, theta;
+} tnv_context_t;
+
+HWSched sched = NULL;
+tnv_context_t tnv_ctx;
+
+
+static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ free(ctx->Input);
+ free(ctx->u);
+ free(ctx->u_upd);
+ free(ctx->qx);
+ free(ctx->qy);
+ free(ctx->qx_upd);
+ free(ctx->qy_upd);
+ free(ctx->gradx);
+ free(ctx->grady);
+ free(ctx->gradx_upd);
+ free(ctx->grady_upd);
+ free(ctx->div);
+ free(ctx->div_upd);
+
+ return 0;
+}
+
+static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+
+// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+
+ // Auxiliar vectors
+ ctx->Input = memalign(64, Dim1Total * sizeof(float));
+ ctx->u = memalign(64, Dim1Total * sizeof(float));
+ ctx->u_upd = memalign(64, Dim1Total * sizeof(float));
+ ctx->qx = memalign(64, DimTotal * sizeof(float));
+ ctx->qy = memalign(64, DimTotal * sizeof(float));
+ ctx->qx_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->qy_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->gradx = memalign(64, DimTotal * sizeof(float));
+ ctx->grady = memalign(64, DimTotal * sizeof(float));
+ ctx->gradx_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->grady_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->div = memalign(64, Dim1Total * sizeof(float));
+ ctx->div_upd = malloc(Dim1Total * sizeof(float));
+
+ if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+ fprintf(stderr, "Error allocating memory\n");
+ exit(-1);
+ }
+
+ return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<copY; j++) {
+ for(i=0; i<dimX; i++) {
+ ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+ ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<stepY; j++) {
+ for(i=0; i<dimX; i++) {
+ tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ // Auxiliar vectors
+ memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+ memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float));
+ memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float));
+ memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float));
+ memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float));
+ memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ // Auxiliar vectors
+ memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float));
+ memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float));
+ memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float));
+ memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float));
+ memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float));
+ memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+ long i, j, k, l;
+
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ float *Input = ctx->Input;
+ float *u = ctx->u;
+ float *u_upd = ctx->u_upd;
+ float *qx = ctx->qx;
+ float *qy = ctx->qy;
+ float *qx_upd = ctx->qx_upd;
+ float *qy_upd = ctx->qy_upd;
+ float *gradx = ctx->gradx;
+ float *grady = ctx->grady;
+ float *gradx_upd = ctx->gradx_upd;
+ float *grady_upd = ctx->grady_upd;
+ float *div = ctx->div;
+ float *div_upd = ctx->div_upd;
+
+ long p = 1l;
+ long q = 1l;
+ long r = 0l;
+
+ float lambda = tnv_ctx->lambda;
+ float sigma = tnv_ctx->sigma;
+ float tau = tnv_ctx->tau;
+ float theta = tnv_ctx->theta;
+
+ float taulambda = tau * lambda;
+ float divtau = 1.0f / tau;
+ float divsigma = 1.0f / sigma;
+ float theta1 = 1.0f + theta;
+ float constant = 1.0f + taulambda;
+
+ float resprimal = 0.0;
+ float resdual = 0.0;
+ float product = 0.0;
+ float unorm = 0.0;
+ float qnorm = 0.0;
+
+ float udiff[dimZ] __attribute__((aligned(64)));
+ float qxdiff __attribute__((aligned(64)));
+ float qydiff __attribute__((aligned(64)));
+ float divdiff __attribute__((aligned(64)));
+ float gradxdiff[dimZ] __attribute__((aligned(64)));
+ float gradydiff[dimZ] __attribute__((aligned(64)));
+
+ for(int j1 = 0; j1 < stepY; j1 += BLOCK) {
+ for(int i1 = 0; i1 < dimX; i1 += BLOCK) {
+ for(int j2 = 0; j2 < BLOCK; j2++) {
+ j = j1 + j2;
+ for(int i2 = 0; i2 < BLOCK; i2++) {
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+
+ i = i1 + i2;
+ if (i == dimX) break;
+ if (j == stepY) { j2 = BLOCK; break; }
+ l = (j * dimX + i) * padZ;
+
+//#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < dimZ; k++) {
+ u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
+ udiff[k] = u[l + k] - u_upd[l + k];
+ unorm += (udiff[k] * udiff[k]);
+
+ gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]);
+ grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]);
+ gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
+ gradydiff[k] = grady[l + k] - grady_upd[l + k];
+
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+//#define TNV_NEW_STYLE
+#ifdef TNV_NEW_STYLE
+ qx_upd[l + k] = qx[l + k] + sigma * ubarx;
+ qy_upd[l + k] = qy[l + k] + sigma * ubary;
+
+ float vx = divsigma * qx_upd[l + k]; //+ ubarx
+ float vy = divsigma * qy_upd[l + k]; //+ ubary
+#else
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+#endif
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+//#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < dimZ; k++) {
+#ifdef TNV_NEW_STYLE
+ float vx = divsigma * qx_upd[l + k];
+ float vy = divsigma * qy_upd[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] -= sigma * gx_upd;
+ qy_upd[l + k] -= sigma * gy_upd;
+#else
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
+ qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
+#endif
+
+ float div_upd_val = 0;
+ div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0;
+ div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0;
+ div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0;
+ div_upd_val += (j < (copY-1))?qy_upd[l + k]:0;
+ div_upd[l + k] = div_upd_val;
+
+ qxdiff = qx[l + k] - qx_upd[l + k];
+ qydiff = qy[l + k] - qy_upd[l + k];
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+ resdual += fabs(divsigma * qxdiff - gradxdiff[k]);
+ resdual += fabs(divsigma * qydiff - gradydiff[k]);
+ product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+ if ((offY == 0)||(j > 0)) {
+ divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history?
+ resprimal += fabs(divtau * udiff[k] + divdiff);
+ }
+ }
+
+ } // i
+ } // j
+ } // i
+ } // j
+
+
+ ctx->resprimal = resprimal;
+ ctx->resdual = resdual;
+ ctx->product = product;
+ ctx->unorm = unorm;
+ ctx->qnorm = qnorm;
+
+ return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+ int i, off, size, err;
+
+ if (sched) return;
+
+ tnv_ctx.dimX = dimX;
+ tnv_ctx.dimY = dimY;
+ tnv_ctx.dimZ = dimZ;
+ // Padding seems actually slower
+ tnv_ctx.padZ = dimZ;
+// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+
+ hw_sched_init();
+
+ int threads = hw_sched_get_cpu_count();
+ if (threads > dimY) threads = dimY/2;
+
+ int step = dimY / threads;
+ int extra = dimY % threads;
+
+ tnv_ctx.threads = threads;
+ tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+ for (i = 0, off = 0; i < threads; i++, off += size) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+ size = step + ((i < extra)?1:0);
+
+ ctx->offY = off;
+ ctx->stepY = size;
+
+ if (i == (threads-1)) ctx->copY = ctx->stepY;
+ else ctx->copY = ctx->stepY + 1;
+ }
+
+ sched = hw_sched_create(threads);
+ if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see
+ * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. eplsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+ int err;
+ int iter;
+ int i,j,k,l,m;
+
+ lambda = 1.0f/(2.0f*lambda);
+ tnv_ctx.lambda = lambda;
+
+ // PDHG algorithm parameters
+ float tau = 0.5f;
+ float sigma = 0.5f;
+ float theta = 1.0f;
+
+ // Backtracking parameters
+ float s = 1.0f;
+ float gamma = 0.75f;
+ float beta = 0.95f;
+ float alpha0 = 0.2f;
+ float alpha = alpha0;
+ float delta = 1.5f;
+ float eta = 0.95f;
+
+ TNV_CPU_init(InputT, uT, dimX, dimY, dimZ);
+
+ tnv_ctx.InputT = InputT;
+ tnv_ctx.uT = uT;
+
+ int padZ = tnv_ctx.padZ;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); }
+
+
+ // Apply Primal-Dual Hybrid Gradient scheme
+ float residual = fLarge;
+ for(iter = 0; iter < maxIter; iter++) {
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float divtau = 1.0f / tau;
+
+ tnv_ctx.sigma = sigma;
+ tnv_ctx.tau = tau;
+ tnv_ctx.theta = theta;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); }
+
+ // border regions
+ for (j = 1; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1);
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+
+ m = ctx0->stepY * dimX * padZ;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l];
+ ctx0->div_upd[m + l] = ctx->div_upd[l];
+ ctx0->u_upd[m + l] = ctx->u_upd[l];
+
+ float divdiff = ctx->div[l] - ctx->div_upd[l];
+ float udiff = ctx->u[l] - ctx->u_upd[l];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
+
+ for (j = 0; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+ resprimal += ctx->resprimal;
+ resdual += ctx->resdual;
+ product += ctx->product;
+ unorm += ctx->unorm;
+ qnorm += ctx->qnorm;
+ }
+
+ residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+ float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+ float dual_dot_delta = resdual * s * delta;
+ float dual_div_delta = (resdual * s) / delta;
+ printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+ if(b > 1) {
+ // Decrease step-sizes to fit balancing principle
+ tau = (beta * tau) / b;
+ sigma = (beta * sigma) / b;
+ alpha = alpha0;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+ } else {
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+ if(resprimal > dual_dot_delta) {
+ // Increase primal step-size and decrease dual step-size
+ tau = tau / (1.0f - alpha);
+ sigma = sigma * (1.0f - alpha);
+ alpha = alpha * eta;
+ } else if(resprimal < dual_div_delta) {
+ // Decrease primal step-size and increase dual step-size
+ tau = tau * (1.0f - alpha);
+ sigma = sigma / (1.0f - alpha);
+ alpha = alpha * eta;
+ }
+ }
+
+ if (residual < tol) break;
+ }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+ printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+ printf("Return: %f\n", *uT);
+
+ return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v19 b/src/Core/performance_CPU/TNV_core.c.v19
new file mode 100755
index 0000000..9b19ed5
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v19
@@ -0,0 +1,681 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyriht 2020 Suren A. Chlingaryan
+ * Optimized version with 1/2 of memory consumption and ~4x performance
+ * This version is algorithmicly comptable with the original, but slight change in results
+ * is expected due to different order of floating-point operations.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <malloc.h>
+#include "TNV_core.h"
+
+#define BLOCK 32
+#define min(a,b) (((a)<(b))?(a):(b))
+
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+ int ii, num;
+ float divsigma = 1.0f / sigma;
+ float sum, shrinkfactor;
+ float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+ float proj[2] = {0};
+
+ // Compute eigenvalues of M
+ T = M1 + M3;
+ D = M1 * M3 - M2 * M2;
+ det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f));
+ eig1 = MAX((T / 2.0f) + det, 0.0f);
+ eig2 = MAX((T / 2.0f) - det, 0.0f);
+ sig1 = sqrtf(eig1);
+ sig2 = sqrtf(eig2);
+
+ // Compute normalized eigenvectors
+ V1 = V2 = V3 = V4 = 0.0f;
+
+ if(M2 != 0.0f)
+ {
+ v0 = M2;
+ v1 = eig1 - M3;
+ v2 = eig2 - M3;
+
+ mu1 = sqrtf(v0 * v0 + v1 * v1);
+ mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+ if(mu1 > fTiny)
+ {
+ V1 = v1 / mu1;
+ V3 = v0 / mu1;
+ }
+
+ if(mu2 > fTiny)
+ {
+ V2 = v2 / mu2;
+ V4 = v0 / mu2;
+ }
+
+ } else
+ {
+ if(M1 > M3)
+ {
+ V1 = V4 = 1.0f;
+ V2 = V3 = 0.0f;
+
+ } else
+ {
+ V1 = V4 = 0.0f;
+ V2 = V3 = 1.0f;
+ }
+ }
+
+ // Compute prox_p of the diagonal entries
+ sig1_upd = sig2_upd = 0.0f;
+
+ if(p == 1)
+ {
+ sig1_upd = MAX(sig1 - divsigma, 0.0f);
+ sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+ } else if(p == INFNORM)
+ {
+ proj[0] = sigma * fabs(sig1);
+ proj[1] = sigma * fabs(sig2);
+
+ /*l1 projection part */
+ sum = fLarge;
+ num = 0l;
+ shrinkfactor = 0.0f;
+ while(sum > 1.0f)
+ {
+ sum = 0.0f;
+ num = 0;
+
+ for(ii = 0; ii < 2; ii++)
+ {
+ proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+ sum += fabs(proj[ii]);
+ if(proj[ii]!= 0.0f)
+ num++;
+ }
+
+ if(num > 0)
+ shrinkfactor = (sum - 1.0f) / num;
+ else
+ break;
+ }
+ /*l1 proj ends*/
+
+ sig1_upd = sig1 - divsigma * proj[0];
+ sig2_upd = sig2 - divsigma * proj[1];
+ }
+
+ // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+ if(sig1 > fTiny)
+ sig1_upd /= sig1;
+
+ if(sig2 > fTiny)
+ sig2_upd /= sig2;
+
+ // Compute solution
+ t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+ t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+ t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+#include "hw_sched.h"
+typedef struct {
+ int offY, stepY, copY;
+ float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd;
+ float *div, *div_upd;
+ float resprimal, resdual;
+ float unorm, qnorm, product;
+} tnv_thread_t;
+
+typedef struct {
+ int threads;
+ tnv_thread_t *thr_ctx;
+ float *InputT, *uT;
+ int dimX, dimY, dimZ, padZ;
+ float lambda, sigma, tau, theta;
+} tnv_context_t;
+
+HWSched sched = NULL;
+tnv_context_t tnv_ctx;
+
+
+static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ free(ctx->Input);
+ free(ctx->u);
+ free(ctx->u_upd);
+ free(ctx->qx);
+ free(ctx->qy);
+ free(ctx->qx_upd);
+ free(ctx->qy_upd);
+ free(ctx->gradx);
+ free(ctx->grady);
+ free(ctx->gradx_upd);
+ free(ctx->grady_upd);
+ free(ctx->div);
+ free(ctx->div_upd);
+
+ return 0;
+}
+
+static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+
+// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+
+ // Auxiliar vectors
+ ctx->Input = memalign(64, Dim1Total * sizeof(float));
+ ctx->u = memalign(64, Dim1Total * sizeof(float));
+ ctx->u_upd = memalign(64, Dim1Total * sizeof(float));
+ ctx->qx = memalign(64, DimTotal * sizeof(float));
+ ctx->qy = memalign(64, DimTotal * sizeof(float));
+ ctx->qx_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->qy_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->gradx = memalign(64, DimTotal * sizeof(float));
+ ctx->grady = memalign(64, DimTotal * sizeof(float));
+ ctx->gradx_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->grady_upd = memalign(64, DimTotal * sizeof(float));
+ ctx->div = memalign(64, Dim1Total * sizeof(float));
+ ctx->div_upd = malloc(Dim1Total * sizeof(float));
+
+ if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+ fprintf(stderr, "Error allocating memory\n");
+ exit(-1);
+ }
+
+ return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<copY; j++) {
+ for(i=0; i<dimX; i++) {
+ ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+ ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<stepY; j++) {
+ for(i=0; i<dimX; i++) {
+ tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ // Auxiliar vectors
+ memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+ memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float));
+ memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float));
+ memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float));
+ memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float));
+ memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ // Auxiliar vectors
+ memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float));
+ memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float));
+ memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float));
+ memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float));
+ memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float));
+ memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+ long i, j, k, l;
+ long i1, i2, j1, j2;
+
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ float *Input = ctx->Input;
+ float *u = ctx->u;
+ float *u_upd = ctx->u_upd;
+ float *qx = ctx->qx;
+ float *qy = ctx->qy;
+ float *qx_upd = ctx->qx_upd;
+ float *qy_upd = ctx->qy_upd;
+ float *gradx = ctx->gradx;
+ float *grady = ctx->grady;
+ float *gradx_upd = ctx->gradx_upd;
+ float *grady_upd = ctx->grady_upd;
+ float *div = ctx->div;
+ float *div_upd = ctx->div_upd;
+
+ long p = 1l;
+ long q = 1l;
+ long r = 0l;
+
+ float lambda = tnv_ctx->lambda;
+ float sigma = tnv_ctx->sigma;
+ float tau = tnv_ctx->tau;
+ float theta = tnv_ctx->theta;
+
+ float taulambda = tau * lambda;
+ float divtau = 1.0f / tau;
+ float divsigma = 1.0f / sigma;
+ float theta1 = 1.0f + theta;
+ float constant = 1.0f + taulambda;
+
+ float resprimal = 0.0f;
+ float resdual1 = 0.0f;
+ float resdual2 = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float udiff[dimZ] __attribute__((aligned(64)));
+ float qxdiff __attribute__((aligned(64)));
+ float qydiff __attribute__((aligned(64)));
+ float divdiff __attribute__((aligned(64)));
+ float gradxdiff[dimZ] __attribute__((aligned(64)));
+ float gradydiff[dimZ] __attribute__((aligned(64)));
+
+
+ j = 0; {
+# define TNV_LOOP_FIRST_J
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ for(i = 1; i < (dimX - 1); i++) {
+# include "TNV_core_backtrack_loop.h"
+ }
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+# undef TNV_LOOP_FIRST_J
+ }
+
+
+
+ for(int j = 1; j < (copY - 1); j++) {
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ }
+
+ for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) {
+ for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) {
+ for(int j2 = 0; j2 < BLOCK; j2 ++) {
+ j = j1 + j2;
+ for(int i2 = 0; i2 < BLOCK; i2++) {
+ i = i1 + i2;
+
+ if (i == (dimX - 1)) break;
+ if (j == (copY - 1)) { j2 = BLOCK; break; }
+# include "TNV_core_backtrack_loop.h"
+ }
+ }
+ } // i
+
+ }
+
+ for(int j = 1; j < (copY - 1); j++) {
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+ }
+
+
+
+ for (j = copY - 1; j < stepY; j++) {
+# define TNV_LOOP_LAST_J
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ for(i = 1; i < (dimX - 1); i++) {
+# include "TNV_core_backtrack_loop.h"
+ }
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+# undef TNV_LOOP_LAST_J
+ }
+
+
+ ctx->resprimal = resprimal;
+ ctx->resdual = resdual1 + resdual2;
+ ctx->product = product;
+ ctx->unorm = unorm;
+ ctx->qnorm = qnorm;
+
+ return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+ int i, off, size, err;
+
+ if (sched) return;
+
+ tnv_ctx.dimX = dimX;
+ tnv_ctx.dimY = dimY;
+ tnv_ctx.dimZ = dimZ;
+ // Padding seems actually slower
+ tnv_ctx.padZ = dimZ;
+// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+
+ hw_sched_init();
+
+ int threads = hw_sched_get_cpu_count();
+ if (threads > dimY) threads = dimY/2;
+
+ int step = dimY / threads;
+ int extra = dimY % threads;
+
+ tnv_ctx.threads = threads;
+ tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+ for (i = 0, off = 0; i < threads; i++, off += size) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+ size = step + ((i < extra)?1:0);
+
+ ctx->offY = off;
+ ctx->stepY = size;
+
+ if (i == (threads-1)) ctx->copY = ctx->stepY;
+ else ctx->copY = ctx->stepY + 1;
+ }
+
+ sched = hw_sched_create(threads);
+ if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see
+ * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. eplsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+ int err;
+ int iter;
+ int i,j,k,l,m;
+
+ lambda = 1.0f/(2.0f*lambda);
+ tnv_ctx.lambda = lambda;
+
+ // PDHG algorithm parameters
+ float tau = 0.5f;
+ float sigma = 0.5f;
+ float theta = 1.0f;
+
+ // Backtracking parameters
+ float s = 1.0f;
+ float gamma = 0.75f;
+ float beta = 0.95f;
+ float alpha0 = 0.2f;
+ float alpha = alpha0;
+ float delta = 1.5f;
+ float eta = 0.95f;
+
+ TNV_CPU_init(InputT, uT, dimX, dimY, dimZ);
+
+ tnv_ctx.InputT = InputT;
+ tnv_ctx.uT = uT;
+
+ int padZ = tnv_ctx.padZ;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); }
+
+
+ // Apply Primal-Dual Hybrid Gradient scheme
+ float residual = fLarge;
+ for(iter = 0; iter < maxIter; iter++) {
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float divtau = 1.0f / tau;
+
+ tnv_ctx.sigma = sigma;
+ tnv_ctx.tau = tau;
+ tnv_ctx.theta = theta;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); }
+
+ // border regions
+ for (j = 1; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1);
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+
+ m = ctx0->stepY * dimX * padZ;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l];
+ ctx0->div_upd[m + l] = ctx->div_upd[l];
+ ctx0->u_upd[m + l] = ctx->u_upd[l];
+
+ float divdiff = ctx->div[l] - ctx->div_upd[l];
+ float udiff = ctx->u[l] - ctx->u_upd[l];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
+
+ {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ float divdiff = ctx->div[l] - ctx->div_upd[l];
+ float udiff = ctx->u[l] - ctx->u_upd[l];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
+
+ for (j = 0; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+ resprimal += ctx->resprimal;
+ resdual += ctx->resdual;
+ product += ctx->product;
+ unorm += ctx->unorm;
+ qnorm += ctx->qnorm;
+ }
+
+ residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+ float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+ float dual_dot_delta = resdual * s * delta;
+ float dual_div_delta = (resdual * s) / delta;
+ printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+ if(b > 1) {
+ // Decrease step-sizes to fit balancing principle
+ tau = (beta * tau) / b;
+ sigma = (beta * sigma) / b;
+ alpha = alpha0;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+ } else {
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+ if(resprimal > dual_dot_delta) {
+ // Increase primal step-size and decrease dual step-size
+ tau = tau / (1.0f - alpha);
+ sigma = sigma * (1.0f - alpha);
+ alpha = alpha * eta;
+ } else if(resprimal < dual_div_delta) {
+ // Decrease primal step-size and increase dual step-size
+ tau = tau * (1.0f - alpha);
+ sigma = sigma / (1.0f - alpha);
+ alpha = alpha * eta;
+ }
+ }
+
+ if (residual < tol) break;
+ }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+ printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+ printf("Return: %f\n", *uT);
+
+ return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v27 b/src/Core/performance_CPU/TNV_core.c.v27
new file mode 100755
index 0000000..dce414a
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v27
@@ -0,0 +1,650 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyriht 2020 Suren A. Chlingaryan
+ * Optimized version with 1/3 of memory consumption and ~10x performance
+ * This version is not able to perform back-track except during first iterations
+ * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c)
+ * could be executed instead. It still better than original with 1/2 of memory consumption and 4x performance gain
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TNV_core.h"
+
+#define BLOCK 32
+#define min(a,b) (((a)<(b))?(a):(b))
+
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+ int ii, num;
+ float divsigma = 1.0f / sigma;
+ float sum, shrinkfactor;
+ float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+ float proj[2] = {0};
+
+ // Compute eigenvalues of M
+ T = M1 + M3;
+ D = M1 * M3 - M2 * M2;
+ det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f));
+ eig1 = MAX((T / 2.0f) + det, 0.0f);
+ eig2 = MAX((T / 2.0f) - det, 0.0f);
+ sig1 = sqrtf(eig1);
+ sig2 = sqrtf(eig2);
+
+ // Compute normalized eigenvectors
+ V1 = V2 = V3 = V4 = 0.0f;
+
+ if(M2 != 0.0f)
+ {
+ v0 = M2;
+ v1 = eig1 - M3;
+ v2 = eig2 - M3;
+
+ mu1 = sqrtf(v0 * v0 + v1 * v1);
+ mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+ if(mu1 > fTiny)
+ {
+ V1 = v1 / mu1;
+ V3 = v0 / mu1;
+ }
+
+ if(mu2 > fTiny)
+ {
+ V2 = v2 / mu2;
+ V4 = v0 / mu2;
+ }
+
+ } else
+ {
+ if(M1 > M3)
+ {
+ V1 = V4 = 1.0f;
+ V2 = V3 = 0.0f;
+
+ } else
+ {
+ V1 = V4 = 0.0f;
+ V2 = V3 = 1.0f;
+ }
+ }
+
+ // Compute prox_p of the diagonal entries
+ sig1_upd = sig2_upd = 0.0f;
+
+ if(p == 1)
+ {
+ sig1_upd = MAX(sig1 - divsigma, 0.0f);
+ sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+ } else if(p == INFNORM)
+ {
+ proj[0] = sigma * fabs(sig1);
+ proj[1] = sigma * fabs(sig2);
+
+ /*l1 projection part */
+ sum = fLarge;
+ num = 0l;
+ shrinkfactor = 0.0f;
+ while(sum > 1.0f)
+ {
+ sum = 0.0f;
+ num = 0;
+
+ for(ii = 0; ii < 2; ii++)
+ {
+ proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+ sum += fabs(proj[ii]);
+ if(proj[ii]!= 0.0f)
+ num++;
+ }
+
+ if(num > 0)
+ shrinkfactor = (sum - 1.0f) / num;
+ else
+ break;
+ }
+ /*l1 proj ends*/
+
+ sig1_upd = sig1 - divsigma * proj[0];
+ sig2_upd = sig2 - divsigma * proj[1];
+ }
+
+ // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+ if(sig1 > fTiny)
+ sig1_upd /= sig1;
+
+ if(sig2 > fTiny)
+ sig2_upd /= sig2;
+
+ // Compute solution
+ t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+ t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+ t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+#include "hw_sched.h"
+typedef struct {
+ int offY, stepY, copY;
+ float *Input, *u, *qx, *qy, *gradx, *grady, *div;
+ float *div0, *udiff0;
+ float resprimal, resdual;
+ float unorm, qnorm, product;
+} tnv_thread_t;
+
+typedef struct {
+ int threads;
+ tnv_thread_t *thr_ctx;
+ float *InputT, *uT;
+ int dimX, dimY, dimZ, padZ;
+ float lambda, sigma, tau, theta;
+} tnv_context_t;
+
+HWSched sched = NULL;
+tnv_context_t tnv_ctx;
+
+
+static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ free(ctx->Input);
+ free(ctx->u);
+ free(ctx->qx);
+ free(ctx->qy);
+ free(ctx->gradx);
+ free(ctx->grady);
+ free(ctx->div);
+
+ free(ctx->div0);
+ free(ctx->udiff0);
+
+ return 0;
+}
+
+static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+
+// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+ long DimRow = (long)(dimX * padZ);
+
+ // Auxiliar vectors
+ ctx->Input = memalign(64, Dim1Total * sizeof(float));
+ ctx->u = memalign(64, Dim1Total * sizeof(float));
+ ctx->qx = memalign(64, DimTotal * sizeof(float));
+ ctx->qy = memalign(64, DimTotal * sizeof(float));
+ ctx->gradx = memalign(64, DimTotal * sizeof(float));
+ ctx->grady = memalign(64, DimTotal * sizeof(float));
+ ctx->div = memalign(64, Dim1Total * sizeof(float));
+
+ ctx->div0 = memalign(64, DimRow * sizeof(float));
+ ctx->udiff0 = memalign(64, DimRow * sizeof(float));
+
+ if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) {
+ fprintf(stderr, "Error allocating memory\n");
+ exit(-1);
+ }
+
+ return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<copY; j++) {
+ for(i=0; i<dimX; i++) {
+ ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+ ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<stepY; j++) {
+ for(i=0; i<dimX; i++) {
+ tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+ long i, j, k, l;
+
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ float *Input = ctx->Input;
+ float *u = ctx->u;
+ float *qx = ctx->qx;
+ float *qy = ctx->qy;
+ float *gradx = ctx->gradx;
+ float *grady = ctx->grady;
+ float *div = ctx->div;
+
+ long p = 1l;
+ long q = 1l;
+ long r = 0l;
+
+ float lambda = tnv_ctx->lambda;
+ float sigma = tnv_ctx->sigma;
+ float tau = tnv_ctx->tau;
+ float theta = tnv_ctx->theta;
+
+ float taulambda = tau * lambda;
+ float divtau = 1.0f / tau;
+ float divsigma = 1.0f / sigma;
+ float theta1 = 1.0f + theta;
+ float constant = 1.0f + taulambda;
+
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float qxdiff;
+ float qydiff;
+ float divdiff;
+ float gradxdiff[dimZ] __attribute__((aligned(64)));
+ float gradydiff[dimZ] __attribute__((aligned(64)));
+ float ubarx[dimZ] __attribute__((aligned(64)));
+ float ubary[dimZ] __attribute__((aligned(64)));
+ float udiff[dimZ] __attribute__((aligned(64)));
+
+
+ for(i=0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+ float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant;
+ float udiff = u[l] - u_upd;
+ ctx->udiff0[l] = udiff;
+ ctx->div0[l] = div[l];
+ }
+ }
+
+ for(int j1 = 0; j1 < stepY; j1 += BLOCK) {
+ for(int i1 = 0; i1 < dimX; i1 += BLOCK) {
+ for(int j2 = 0; j2 < BLOCK; j2++) {
+ j = j1 + j2;
+ for(int i2 = 0; i2 < BLOCK; i2++) {
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+
+ i = i1 + i2;
+ if (i == dimX) break;
+ if (j == stepY) { j2 = BLOCK; break; }
+ l = (j * dimX + i) * padZ;
+
+#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < dimZ; k++) {
+ float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
+ udiff[k] = u[l + k] - u_upd;
+ u[l + k] = u_upd;
+
+ float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd);
+ float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd);
+ gradxdiff[k] = gradx[l + k] - gradx_upd;
+ gradydiff[k] = grady[l + k] - grady_upd;
+ gradx[l + k] = gradx_upd;
+ grady[l + k] = grady_upd;
+
+ ubarx[k] = gradx_upd - theta * gradxdiff[k];
+ ubary[k] = grady_upd - theta * gradydiff[k];
+
+ float vx = ubarx[k] + divsigma * qx[l + k];
+ float vy = ubary[k] + divsigma * qy[l + k];
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < dimZ; k++) {
+ float vx = ubarx[k] + divsigma * qx[l + k];
+ float vy = ubary[k] + divsigma * qy[l + k];
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qxdiff = sigma * (ubarx[k] - gx_upd);
+ qydiff = sigma * (ubary[k] - gy_upd);
+
+ qx[l + k] += qxdiff;
+ qy[l + k] += qydiff;
+
+ unorm += (udiff[k] * udiff[k]);
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+ float div_upd = 0;
+ div_upd -= (i > 0)?qx[l + k - padZ]:0;
+ div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0;
+ div_upd += (i < (dimX-1))?qx[l + k]:0;
+ div_upd += (j < (copY-1))?qy[l + k]:0;
+ divdiff = div[l + k] - div_upd;
+ div[l + k] = div_upd;
+
+ resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0;
+ resdual += fabs(divsigma * qxdiff + gradxdiff[k]);
+ resdual += fabs(divsigma * qydiff + gradydiff[k]);
+ product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+ }
+
+ } // i
+ } // j
+ } // i
+ } // j
+
+
+ ctx->resprimal = resprimal;
+ ctx->resdual = resdual;
+ ctx->product = product;
+ ctx->unorm = unorm;
+ ctx->qnorm = qnorm;
+
+ return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+ int i, off, size, err;
+
+ if (sched) return;
+
+ tnv_ctx.dimX = dimX;
+ tnv_ctx.dimY = dimY;
+ tnv_ctx.dimZ = dimZ;
+ // Padding seems actually slower
+// tnv_ctx.padZ = dimZ;
+// tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0));
+ tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+
+ hw_sched_init();
+
+ int threads = hw_sched_get_cpu_count();
+ if (threads > dimY) threads = dimY/2;
+
+ int step = dimY / threads;
+ int extra = dimY % threads;
+
+ tnv_ctx.threads = threads;
+ tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+ for (i = 0, off = 0; i < threads; i++, off += size) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+ size = step + ((i < extra)?1:0);
+
+ ctx->offY = off;
+ ctx->stepY = size;
+
+ if (i == (threads-1)) ctx->copY = ctx->stepY;
+ else ctx->copY = ctx->stepY + 1;
+ }
+
+ sched = hw_sched_create(threads);
+ if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see
+ * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. eplsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+ int err;
+ int iter;
+ int i,j,k,l,m;
+
+ lambda = 1.0f/(2.0f*lambda);
+ tnv_ctx.lambda = lambda;
+
+ // PDHG algorithm parameters
+ float tau = 0.5f;
+ float sigma = 0.5f;
+ float theta = 1.0f;
+
+ // Backtracking parameters
+ float s = 1.0f;
+ float gamma = 0.75f;
+ float beta = 0.95f;
+ float alpha0 = 0.2f;
+ float alpha = alpha0;
+ float delta = 1.5f;
+ float eta = 0.95f;
+
+ TNV_CPU_init(InputT, uT, dimX, dimY, dimZ);
+
+ tnv_ctx.InputT = InputT;
+ tnv_ctx.uT = uT;
+
+ int padZ = tnv_ctx.padZ;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); }
+
+
+ // Apply Primal-Dual Hybrid Gradient scheme
+ float residual = fLarge;
+ int started = 0;
+ for(iter = 0; iter < maxIter; iter++) {
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float divtau = 1.0f / tau;
+
+ tnv_ctx.sigma = sigma;
+ tnv_ctx.tau = tau;
+ tnv_ctx.theta = theta;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); }
+
+ // border regions
+ for (j = 1; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1);
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+
+ m = (ctx0->stepY - 1) * dimX * padZ;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ float divdiff = ctx->div0[l] - ctx->div[l];
+ float udiff = ctx->udiff0[l];
+
+ ctx->div[l] -= ctx0->qy[l + m];
+ ctx0->div[m + l + dimX * padZ] = ctx->div[l];
+ ctx0->u[m + l + dimX * padZ] = ctx->u[l];
+
+ divdiff += ctx0->qy[l + m];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
+
+ for (j = 0; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+ resprimal += ctx->resprimal;
+ resdual += ctx->resdual;
+ product += ctx->product;
+ unorm += ctx->unorm;
+ qnorm += ctx->qnorm;
+ }
+
+ residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+ float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+ float dual_dot_delta = resdual * s * delta;
+ float dual_div_delta = (resdual * s) / delta;
+ printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+ if(b > 1) {
+
+ // Decrease step-sizes to fit balancing principle
+ tau = (beta * tau) / b;
+ sigma = (beta * sigma) / b;
+ alpha = alpha0;
+
+ if (started) {
+ fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n");
+ } else {
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+ }
+ } else {
+ started = 1;
+ if(resprimal > dual_dot_delta) {
+ // Increase primal step-size and decrease dual step-size
+ tau = tau / (1.0f - alpha);
+ sigma = sigma * (1.0f - alpha);
+ alpha = alpha * eta;
+ } else if(resprimal < dual_div_delta) {
+ // Decrease primal step-size and increase dual step-size
+ tau = tau * (1.0f - alpha);
+ sigma = sigma / (1.0f - alpha);
+ alpha = alpha * eta;
+ }
+ }
+
+ if (residual < tol) break;
+ }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+ printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+ printf("Return: %f\n", *uT);
+
+// exit(-1);
+ return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v32 b/src/Core/performance_CPU/TNV_core.c.v32
new file mode 100755
index 0000000..fcb2f87
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v32
@@ -0,0 +1,676 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <malloc.h>
+#include "TNV_core.h"
+
+#define min(a,b) (((a)<(b))?(a):(b))
+
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+ int ii, num;
+ float divsigma = 1.0f / sigma;
+ float sum, shrinkfactor;
+ float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+ float proj[2] = {0};
+
+ // Compute eigenvalues of M
+ T = M1 + M3;
+ D = M1 * M3 - M2 * M2;
+ det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f));
+ eig1 = MAX((T / 2.0f) + det, 0.0f);
+ eig2 = MAX((T / 2.0f) - det, 0.0f);
+ sig1 = sqrtf(eig1);
+ sig2 = sqrtf(eig2);
+
+ // Compute normalized eigenvectors
+ V1 = V2 = V3 = V4 = 0.0f;
+
+ if(M2 != 0.0f)
+ {
+ v0 = M2;
+ v1 = eig1 - M3;
+ v2 = eig2 - M3;
+
+ mu1 = sqrtf(v0 * v0 + v1 * v1);
+ mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+ if(mu1 > fTiny)
+ {
+ V1 = v1 / mu1;
+ V3 = v0 / mu1;
+ }
+
+ if(mu2 > fTiny)
+ {
+ V2 = v2 / mu2;
+ V4 = v0 / mu2;
+ }
+
+ } else
+ {
+ if(M1 > M3)
+ {
+ V1 = V4 = 1.0f;
+ V2 = V3 = 0.0f;
+
+ } else
+ {
+ V1 = V4 = 0.0f;
+ V2 = V3 = 1.0f;
+ }
+ }
+
+ // Compute prox_p of the diagonal entries
+ sig1_upd = sig2_upd = 0.0f;
+
+ if(p == 1)
+ {
+ sig1_upd = MAX(sig1 - divsigma, 0.0f);
+ sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+ } else if(p == INFNORM)
+ {
+ proj[0] = sigma * fabs(sig1);
+ proj[1] = sigma * fabs(sig2);
+
+ /*l1 projection part */
+ sum = fLarge;
+ num = 0l;
+ shrinkfactor = 0.0f;
+ while(sum > 1.0f)
+ {
+ sum = 0.0f;
+ num = 0;
+
+ for(ii = 0; ii < 2; ii++)
+ {
+ proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+ sum += fabs(proj[ii]);
+ if(proj[ii]!= 0.0f)
+ num++;
+ }
+
+ if(num > 0)
+ shrinkfactor = (sum - 1.0f) / num;
+ else
+ break;
+ }
+ /*l1 proj ends*/
+
+ sig1_upd = sig1 - divsigma * proj[0];
+ sig2_upd = sig2 - divsigma * proj[1];
+ }
+
+ // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+ if(sig1 > fTiny)
+ sig1_upd /= sig1;
+
+ if(sig2 > fTiny)
+ sig2_upd /= sig2;
+
+ // Compute solution
+ t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+ t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+ t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+#include "hw_sched.h"
+typedef struct {
+ int offY, stepY, copY;
+ float *Input, *u, *qx, *qy, *gradx, *grady, *div;
+ float *div0, *udiff0;
+ float *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff;
+ float resprimal, resdual;
+ float unorm, qnorm, product;
+} tnv_thread_t;
+
+typedef struct {
+ int threads;
+ tnv_thread_t *thr_ctx;
+ float *InputT, *uT;
+ int dimX, dimY, dimZ, padZ;
+ float lambda, sigma, tau, theta;
+} tnv_context_t;
+
+HWSched sched = NULL;
+tnv_context_t tnv_ctx;
+
+
+static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ free(ctx->Input);
+ free(ctx->u);
+ free(ctx->qx);
+ free(ctx->qy);
+ free(ctx->gradx);
+ free(ctx->grady);
+ free(ctx->div);
+
+ free(ctx->div0);
+ free(ctx->udiff0);
+
+ free(ctx->gradxdiff);
+ free(ctx->gradydiff);
+ free(ctx->ubarx);
+ free(ctx->ubary);
+ free(ctx->udiff);
+
+ return 0;
+}
+
+static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+
+// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+ long DimRow = (long)(dimX * padZ);
+ long DimCell = (long)(padZ);
+
+ // Auxiliar vectors
+ ctx->Input = memalign(64, Dim1Total * sizeof(float));
+ ctx->u = memalign(64, Dim1Total * sizeof(float));
+ ctx->qx = memalign(64, DimTotal * sizeof(float));
+ ctx->qy = memalign(64, DimTotal * sizeof(float));
+ ctx->gradx = memalign(64, DimTotal * sizeof(float));
+ ctx->grady = memalign(64, DimTotal * sizeof(float));
+ ctx->div = memalign(64, Dim1Total * sizeof(float));
+
+ ctx->div0 = memalign(64, DimRow * sizeof(float));
+ ctx->udiff0 = memalign(64, DimRow * sizeof(float));
+
+ ctx->gradxdiff = memalign(64, DimCell * sizeof(float));
+ ctx->gradydiff = memalign(64, DimCell * sizeof(float));
+ ctx->ubarx = memalign(64, DimCell * sizeof(float));
+ ctx->ubary = memalign(64, DimCell * sizeof(float));
+ ctx->udiff = memalign(64, DimCell * sizeof(float));
+
+ if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) {
+ fprintf(stderr, "Error allocating memory\n");
+ exit(-1);
+ }
+
+ return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<copY; j++) {
+ for(i=0; i<dimX; i++) {
+ ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+ ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<stepY; j++) {
+ for(i=0; i<dimX; i++) {
+ tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+ int i,j,k;
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+ int padZ = tnv_ctx->padZ;
+ long DimTotal = (long)(dimX*stepY*padZ);
+ long Dim1Total = (long)(dimX*copY*padZ);
+
+ memset(ctx->u, 0, Dim1Total * sizeof(float));
+ memset(ctx->qx, 0, DimTotal * sizeof(float));
+ memset(ctx->qy, 0, DimTotal * sizeof(float));
+ memset(ctx->gradx, 0, DimTotal * sizeof(float));
+ memset(ctx->grady, 0, DimTotal * sizeof(float));
+ memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+ return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+ long i, j, k, l, m;
+
+ tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+ tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+ int dimX = tnv_ctx->dimX;
+ int dimY = tnv_ctx->dimY;
+ int dimZ = tnv_ctx->dimZ;
+ int padZ = tnv_ctx->padZ;
+ int offY = ctx->offY;
+ int stepY = ctx->stepY;
+ int copY = ctx->copY;
+
+ float *Input = ctx->Input;
+ float *u = ctx->u;
+ float *qx = ctx->qx;
+ float *qy = ctx->qy;
+ float *gradx = ctx->gradx;
+ float *grady = ctx->grady;
+ float *div = ctx->div;
+
+ long p = 1l;
+ long q = 1l;
+ long r = 0l;
+
+ float lambda = tnv_ctx->lambda;
+ float sigma = tnv_ctx->sigma;
+ float tau = tnv_ctx->tau;
+ float theta = tnv_ctx->theta;
+
+ float taulambda = tau * lambda;
+ float divtau = 1.0f / tau;
+ float divsigma = 1.0f / sigma;
+ float theta1 = 1.0f + theta;
+ float constant = 1.0f + taulambda;
+
+ float resprimal = 0.0f;
+ float resdual1 = 0.0f;
+ float resdual2 = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float qxdiff;
+ float qydiff;
+ float divdiff;
+ float *gradxdiff = ctx->gradxdiff;
+ float *gradydiff = ctx->gradydiff;
+ float *ubarx = ctx->ubarx;
+ float *ubary = ctx->ubary;
+ float *udiff = ctx->udiff;
+
+ float *udiff0 = ctx->udiff0;
+ float *div0 = ctx->div0;
+
+/*
+ for(i=0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+ float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant;
+ float udiff_val = u[l] - u_upd;
+ udiff0[l] = udiff_val;
+ div0[l] = div[l];
+ }
+ }
+*/
+
+
+ j = 0; {
+# define TNV_LOOP_FIRST_J
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ for(i = 1; i < (dimX - 1); i++) {
+# include "TNV_core_loop.h"
+ }
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+# undef TNV_LOOP_FIRST_J
+ }
+
+
+
+ for(int j = 1; j < (copY - 1); j++) {
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ }
+
+#define BLOCK 32
+ for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) {
+ for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) {
+ for(int j2 = 0; j2 < BLOCK; j2 ++) {
+ j = j1 + j2;
+ for(int i2 = 0; i2 < BLOCK; i2++) {
+ i = i1 + i2;
+
+ if (i == (dimX - 1)) break;
+ if (j == (copY - 1)) { j2 = BLOCK; break; }
+# include "TNV_core_loop.h"
+ }
+ }
+ } // i
+
+ }
+
+ for(int j = 1; j < (copY - 1); j++) {
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+ }
+
+
+
+ for (j = copY - 1; j < stepY; j++) {
+# define TNV_LOOP_LAST_J
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ for(i = 1; i < (dimX - 1); i++) {
+# include "TNV_core_loop.h"
+ }
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+# undef TNV_LOOP_LAST_J
+ }
+
+
+
+ ctx->resprimal = resprimal;
+ ctx->resdual = resdual1 + resdual2;
+ ctx->product = product;
+ ctx->unorm = unorm;
+ ctx->qnorm = qnorm;
+
+ return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+ int i, off, size, err;
+
+ if (sched) return;
+
+ tnv_ctx.dimX = dimX;
+ tnv_ctx.dimY = dimY;
+ tnv_ctx.dimZ = dimZ;
+ // Padding seems actually slower
+// tnv_ctx.padZ = dimZ;
+// tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0));
+ tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+
+ hw_sched_init();
+
+ int threads = hw_sched_get_cpu_count();
+ if (threads > dimY) threads = dimY/2;
+
+ int step = dimY / threads;
+ int extra = dimY % threads;
+
+ tnv_ctx.threads = threads;
+ tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+ for (i = 0, off = 0; i < threads; i++, off += size) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+ size = step + ((i < extra)?1:0);
+
+ ctx->offY = off;
+ ctx->stepY = size;
+
+ if (i == (threads-1)) ctx->copY = ctx->stepY;
+ else ctx->copY = ctx->stepY + 1;
+ }
+
+ sched = hw_sched_create(threads);
+ if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see
+ * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. eplsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+ int err;
+ int iter;
+ int i,j,k,l,m;
+
+ lambda = 1.0f/(2.0f*lambda);
+ tnv_ctx.lambda = lambda;
+
+ // PDHG algorithm parameters
+ float tau = 0.5f;
+ float sigma = 0.5f;
+ float theta = 1.0f;
+
+ // Backtracking parameters
+ float s = 1.0f;
+ float gamma = 0.75f;
+ float beta = 0.95f;
+ float alpha0 = 0.2f;
+ float alpha = alpha0;
+ float delta = 1.5f;
+ float eta = 0.95f;
+
+ TNV_CPU_init(InputT, uT, dimX, dimY, dimZ);
+
+ tnv_ctx.InputT = InputT;
+ tnv_ctx.uT = uT;
+
+ int padZ = tnv_ctx.padZ;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); }
+
+
+ // Apply Primal-Dual Hybrid Gradient scheme
+ float residual = fLarge;
+ int started = 0;
+ for(iter = 0; iter < maxIter; iter++) {
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ float divtau = 1.0f / tau;
+
+ tnv_ctx.sigma = sigma;
+ tnv_ctx.tau = tau;
+ tnv_ctx.theta = theta;
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); }
+
+ // border regions
+ for (j = 1; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1);
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+
+ m = (ctx0->stepY - 1) * dimX * padZ;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ float divdiff = ctx->div0[l] - ctx->div[l];
+ float udiff = ctx->udiff0[l];
+
+ ctx->div[l] -= ctx0->qy[l + m];
+ ctx0->div[m + l + dimX*padZ] = ctx->div[l];
+ ctx0->u[m + l + dimX*padZ] = ctx->u[l];
+
+ divdiff += ctx0->qy[l + m];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
+
+ {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ float divdiff = ctx->div0[l] - ctx->div[l];
+ float udiff = ctx->udiff0[l];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
+
+ for (j = 0; j < tnv_ctx.threads; j++) {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+ resprimal += ctx->resprimal;
+ resdual += ctx->resdual;
+ product += ctx->product;
+ unorm += ctx->unorm;
+ qnorm += ctx->qnorm;
+ }
+
+ residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+ float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+ float dual_dot_delta = resdual * s * delta;
+ float dual_div_delta = (resdual * s) / delta;
+ printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+ if(b > 1) {
+
+ // Decrease step-sizes to fit balancing principle
+ tau = (beta * tau) / b;
+ sigma = (beta * sigma) / b;
+ alpha = alpha0;
+
+ if (started) {
+ fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n");
+ } else {
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+ }
+ } else {
+ started = 1;
+ if(resprimal > dual_dot_delta) {
+ // Increase primal step-size and decrease dual step-size
+ tau = tau / (1.0f - alpha);
+ sigma = sigma * (1.0f - alpha);
+ alpha = alpha * eta;
+ } else if(resprimal < dual_div_delta) {
+ // Decrease primal step-size and increase dual step-size
+ tau = tau * (1.0f - alpha);
+ sigma = sigma / (1.0f - alpha);
+ alpha = alpha * eta;
+ }
+ }
+
+ if (residual < tol) break;
+ }
+
+ err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+ if (!err) err = hw_sched_wait_task(sched);
+ if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+ printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+ printf("Return: %f\n", *uT);
+
+ return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v4.stdver b/src/Core/performance_CPU/TNV_core.c.v4.stdver
new file mode 100755
index 0000000..6be60ae
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v4.stdver
@@ -0,0 +1,629 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TNV_core.h"
+
+
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+ int ii, num;
+ float divsigma = 1.0f / sigma;
+ float sum, shrinkfactor;
+ float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+ float proj[2] = {0};
+
+ // Compute eigenvalues of M
+ T = M1 + M3;
+ D = M1 * M3 - M2 * M2;
+ det = sqrt(MAX((T * T / 4.0f) - D, 0.0f));
+ eig1 = MAX((T / 2.0f) + det, 0.0f);
+ eig2 = MAX((T / 2.0f) - det, 0.0f);
+ sig1 = sqrt(eig1);
+ sig2 = sqrt(eig2);
+
+ // Compute normalized eigenvectors
+ V1 = V2 = V3 = V4 = 0.0f;
+
+ if(M2 != 0.0f)
+ {
+ v0 = M2;
+ v1 = eig1 - M3;
+ v2 = eig2 - M3;
+
+ mu1 = sqrtf(v0 * v0 + v1 * v1);
+ mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+ if(mu1 > fTiny)
+ {
+ V1 = v1 / mu1;
+ V3 = v0 / mu1;
+ }
+
+ if(mu2 > fTiny)
+ {
+ V2 = v2 / mu2;
+ V4 = v0 / mu2;
+ }
+
+ } else
+ {
+ if(M1 > M3)
+ {
+ V1 = V4 = 1.0f;
+ V2 = V3 = 0.0f;
+
+ } else
+ {
+ V1 = V4 = 0.0f;
+ V2 = V3 = 1.0f;
+ }
+ }
+
+ // Compute prox_p of the diagonal entries
+ sig1_upd = sig2_upd = 0.0f;
+
+ if(p == 1)
+ {
+ sig1_upd = MAX(sig1 - divsigma, 0.0f);
+ sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+ } else if(p == INFNORM)
+ {
+ proj[0] = sigma * fabs(sig1);
+ proj[1] = sigma * fabs(sig2);
+
+ /*l1 projection part */
+ sum = fLarge;
+ num = 0l;
+ shrinkfactor = 0.0f;
+ while(sum > 1.0f)
+ {
+ sum = 0.0f;
+ num = 0;
+
+ for(ii = 0; ii < 2; ii++)
+ {
+ proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+ sum += fabs(proj[ii]);
+ if(proj[ii]!= 0.0f)
+ num++;
+ }
+
+ if(num > 0)
+ shrinkfactor = (sum - 1.0f) / num;
+ else
+ break;
+ }
+ /*l1 proj ends*/
+
+ sig1_upd = sig1 - divsigma * proj[0];
+ sig2_upd = sig2 - divsigma * proj[1];
+ }
+
+ // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+ if(sig1 > fTiny)
+ sig1_upd /= sig1;
+
+ if(sig2 > fTiny)
+ sig2_upd /= sig2;
+
+ // Compute solution
+ t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+ t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+ t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see
+ * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. eplsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+ long i, j, k, p, q, r, DimTotal;
+ float taulambda;
+ float *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd, *div, *div_upd;
+
+ p = 1l;
+ q = 1l;
+ r = 0l;
+
+ lambda = 1.0f/(2.0f*lambda);
+ DimTotal = (long)(dimX*dimY*dimZ);
+ /* PDHG algorithm parameters*/
+ float tau = 0.5f;
+ float sigma = 0.5f;
+ float theta = 1.0f;
+
+ // Auxiliar vectors
+ u_upd = calloc(DimTotal, sizeof(float));
+ qx = calloc(DimTotal, sizeof(float));
+ qy = calloc(DimTotal, sizeof(float));
+ qx_upd = calloc(DimTotal, sizeof(float));
+ qy_upd = calloc(DimTotal, sizeof(float));
+ gradx = calloc(DimTotal, sizeof(float));
+ grady = calloc(DimTotal, sizeof(float));
+ gradx_upd = calloc(DimTotal, sizeof(float));
+ grady_upd = calloc(DimTotal, sizeof(float));
+ div = calloc(DimTotal, sizeof(float));
+ div_upd = calloc(DimTotal, sizeof(float));
+
+
+ float *Input = calloc(DimTotal, sizeof(float));
+ float *u = calloc(DimTotal, sizeof(float));
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<dimY; j++) {
+ for(i=0; i<dimX; i++) {
+ Input[j * dimX * dimZ + i * dimZ + k] = InputT[k * dimX * dimY + j * dimX + i];
+ u[j * dimX * dimZ + i * dimZ + k] = uT[k * dimX * dimY + j * dimX + i];
+ }
+ }
+ }
+
+
+ // Backtracking parameters
+ float s = 1.0f;
+ float gamma = 0.75f;
+ float beta = 0.95f;
+ float alpha0 = 0.2f;
+ float alpha = alpha0;
+ float delta = 1.5f;
+ float eta = 0.95f;
+
+ // PDHG algorithm parameters
+ taulambda = tau * lambda;
+ float divtau = 1.0f / tau;
+ float divsigma = 1.0f / sigma;
+ float theta1 = 1.0f + theta;
+
+ /*allocate memory for taulambda */
+ //taulambda = (float*) calloc(dimZ, sizeof(float));
+ //for(k=0; k < dimZ; k++) {taulambda[k] = tau*lambda[k];}
+
+ // Apply Primal-Dual Hybrid Gradient scheme
+ int iter = 0;
+ float residual = fLarge;
+
+ for(iter = 0; iter < maxIter; iter++) {
+ // Argument of proximal mapping of fidelity term
+ // Proximal solution of fidelity term
+ // proxG(u_upd, u, div, Input, tau, taulambda, (long)(dimX), (long)(dimY), (long)(dimZ));
+ float constant = 1.0f + taulambda;
+ #pragma omp parallel for shared(Input, u, u_upd) private(k)
+ for(k=0; k<dimZ*dimX*dimY; k++) {
+ float v = u[k] + tau * div[k];
+ u_upd[k] = (v + taulambda * Input[k])/constant;
+ }
+
+ memset(div_upd, 0, dimX*dimY*dimZ*sizeof(float));
+ // This changes results, I guess access to div_upd is violated
+// #pragma omp parallel for shared (gradx_upd,grady_upd,gradx,grady,qx,qy,qx_upd,qy_upd) private(i,j,k)
+ for(j=0; j<dimY; j++) {
+ for(i=0; i<dimX; i++) {
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+ int l = (j * dimX + i) * dimZ;
+
+ for(k = 0; k < dimZ; k++) {
+ if(i != dimX-1)
+ gradx_upd[l + k] = u_upd[l + k + dimZ] - u_upd[l + k];
+ else
+ gradx_upd[l + k] = 0.0f;
+
+ if(j != dimY-1)
+ grady_upd[l + k] = u_upd[l + k + dimX * dimZ] - u_upd[l + k];
+ else
+ grady_upd[l + k] = 0.0f;
+
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+ for(k = 0; k < dimZ; k++) {
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
+ qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
+
+ if(i != dimX-1) {
+ div_upd[l + k] += qx_upd[l + k];
+ div_upd[l + k + dimZ] -= qx_upd[l + k];
+ }
+
+ if(j != dimY-1) {
+ div_upd[l + k] += qy_upd[l + k];
+ div_upd[l + k + dimX * dimZ] -= qy_upd[l + k];
+ }
+ }
+ }
+ }
+
+// Compute primal residual, dual residual, and backtracking condition
+ float resprimal = 0.0f;
+ float resdual = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
+
+ // If this loop is inner, the result slightly changed due to different summation order
+ for(int l=0; l<dimZ; l++)
+ for(j=0; j<dimY; j++)
+ for(i=0; i<dimX; i++)
+ {
+// for(k=0; k<dimX*dimY*dimZ; k++) {
+ int k = j * dimX * dimZ + i * dimZ + l;
+
+ float udiff = u[k] - u_upd[k];
+ float qxdiff = qx[k] - qx_upd[k];
+ float qydiff = qy[k] - qy_upd[k];
+ float divdiff = div[k] - div_upd[k];
+ float gradxdiff = gradx[k] - gradx_upd[k];
+ float gradydiff = grady[k] - grady_upd[k];
+
+ resprimal += fabs(divtau*udiff + divdiff);
+ resdual += fabs(divsigma*qxdiff - gradxdiff);
+ resdual += fabs(divsigma*qydiff - gradydiff);
+
+ unorm += (udiff * udiff);
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+ product += (gradxdiff * qxdiff + gradydiff * qydiff);
+ }
+
+ float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm +
+ gamma * tau * qnorm);
+
+// printf("resprimal: %f, resdual: %f, b: %f\n", resprimal, resdual, b);
+ printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+// printf("b: %f\n", b);
+
+// Adapt step-size parameters
+ float dual_dot_delta = resdual * s * delta;
+ float dual_div_delta = (resdual * s) / delta;
+
+ if(b > 1)
+ {
+ // Decrease step-sizes to fit balancing principle
+ tau = (beta * tau) / b;
+ sigma = (beta * sigma) / b;
+ alpha = alpha0;
+
+ copyIm(u, u_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+ copyIm(qx, qx_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+ copyIm(qy, qy_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+ copyIm(gradx, gradx_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+ copyIm(grady, grady_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+ copyIm(div, div_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+
+ } else if(resprimal > dual_dot_delta)
+ {
+ // Increase primal step-size and decrease dual step-size
+ tau = tau / (1.0f - alpha);
+ sigma = sigma * (1.0f - alpha);
+ alpha = alpha * eta;
+
+ } else if(resprimal < dual_div_delta)
+ {
+ // Decrease primal step-size and increase dual step-size
+ tau = tau * (1.0f - alpha);
+ sigma = sigma / (1.0f - alpha);
+ alpha = alpha * eta;
+ }
+
+// Update variables
+ taulambda = tau * lambda;
+//for(k=0; k < dimZ; k++) taulambda[k] = tau*lambda[k];
+
+ divsigma = 1.0f / sigma;
+ divtau = 1.0f / tau;
+
+ copyIm(u_upd, u, (long)(dimX), (long)(dimY), (long)(dimZ));
+ copyIm(qx_upd, qx, (long)(dimX), (long)(dimY), (long)(dimZ));
+ copyIm(qy_upd, qy, (long)(dimX), (long)(dimY), (long)(dimZ));
+ copyIm(gradx_upd, gradx, (long)(dimX), (long)(dimY), (long)(dimZ));
+ copyIm(grady_upd, grady, (long)(dimX), (long)(dimY), (long)(dimZ));
+ copyIm(div_upd, div, (long)(dimX), (long)(dimY), (long)(dimZ));
+
+// Compute residual at current iteration
+ residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+
+// printf("%f \n", residual);
+ if (residual < tol) {
+ printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+ break;
+ }
+
+ }
+ printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+
+ free (u_upd);
+ free(qx);
+ free(qy);
+ free(qx_upd);
+ free(qy_upd);
+ free(gradx);
+ free(grady);
+ free(gradx_upd);
+ free(grady_upd);
+ free(div);
+ free(div_upd);
+ printf("Return: %f\n", *u);
+
+ for(k=0; k<dimZ; k++) {
+ for(j=0; j<dimY; j++) {
+ for(i=0; i<dimX; i++) {
+ uT[k * dimX * dimY + j * dimX + i] = u[j * dimX * dimZ + i * dimZ + k];
+ }
+ }
+ }
+
+
+
+
+ return *u;
+}
+
+float proxG(float *u_upd, float *v, float *f, float taulambda, long dimX, long dimY, long dimZ)
+{
+ float constant;
+ long k;
+ constant = 1.0f + taulambda;
+ #pragma omp parallel for shared(v, f, u_upd) private(k)
+ for(k=0; k<dimZ*dimX*dimY; k++) {
+ u_upd[k] = (v[k] + taulambda * f[k])/constant;
+ //u_upd[(dimX*dimY)*k + l] = (v[(dimX*dimY)*k + l] + taulambda * f[(dimX*dimY)*k + l])/constant;
+ }
+ return *u_upd;
+}
+
+float gradient(float *u_upd, float *gradx_upd, float *grady_upd, long dimX, long dimY, long dimZ)
+{
+ long i, j, k, l;
+ // Compute discrete gradient using forward differences
+ #pragma omp parallel for shared(gradx_upd,grady_upd,u_upd) private(i, j, k, l)
+ for(k = 0; k < dimZ; k++) {
+ for(j = 0; j < dimY; j++) {
+ l = j * dimX;
+ for(i = 0; i < dimX; i++) {
+ // Derivatives in the x-direction
+ if(i != dimX-1)
+ gradx_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+1+l] - u_upd[(dimX*dimY)*k + i+l];
+ else
+ gradx_upd[(dimX*dimY)*k + i+l] = 0.0f;
+
+ // Derivatives in the y-direction
+ if(j != dimY-1)
+ //grady_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+dimY+l] -u_upd[(dimX*dimY)*k + i+l];
+ grady_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+(j+1)*dimX] -u_upd[(dimX*dimY)*k + i+l];
+ else
+ grady_upd[(dimX*dimY)*k + i+l] = 0.0f;
+ }
+ }
+ }
+ return 1;
+}
+
+float proxF(float *gx, float *gy, float *vx, float *vy, float sigma, int p, int q, int r, long dimX, long dimY, long dimZ)
+{
+ // (S^p, \ell^1) norm decouples at each pixel
+// Spl1(gx, gy, vx, vy, sigma, p, num_channels, dim);
+ float divsigma = 1.0f / sigma;
+
+ // $\ell^{1,1,1}$-TV regularization
+// int i,j,k;
+// #pragma omp parallel for shared (gx,gy,vx,vy) private(i,j,k)
+// for(k = 0; k < dimZ; k++) {
+// for(i=0; i<dimX; i++) {
+// for(j=0; j<dimY; j++) {
+// gx[(dimX*dimY)*k + (i)*dimY + (j)] = SIGN(vx[(dimX*dimY)*k + (i)*dimY + (j)]) * MAX(fabs(vx[(dimX*dimY)*k + (i)*dimY + (j)]) - divsigma, 0.0f);
+// gy[(dimX*dimY)*k + (i)*dimY + (j)] = SIGN(vy[(dimX*dimY)*k + (i)*dimY + (j)]) * MAX(fabs(vy[(dimX*dimY)*k + (i)*dimY + (j)]) - divsigma, 0.0f);
+// }}}
+
+ // Auxiliar vector
+ float *proj, sum, shrinkfactor ;
+ float M1,M2,M3,valuex,valuey,T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd,t1,t2,t3;
+ long i,j,k, ii, num;
+ #pragma omp parallel for shared (gx,gy,vx,vy,p) private(i,ii,j,k,proj,num, sum, shrinkfactor, M1,M2,M3,valuex,valuey,T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4,v0,v1,v2,mu1,mu2,sig1_upd,sig2_upd,t1,t2,t3)
+ for(i=0; i<dimX; i++) {
+ for(j=0; j<dimY; j++) {
+
+ proj = (float*) calloc (2,sizeof(float));
+ // Compute matrix $M\in\R^{2\times 2}$
+ M1 = 0.0f;
+ M2 = 0.0f;
+ M3 = 0.0f;
+
+ for(k = 0; k < dimZ; k++)
+ {
+ valuex = vx[(dimX*dimY)*k + (j)*dimX + (i)];
+ valuey = vy[(dimX*dimY)*k + (j)*dimX + (i)];
+
+ M1 += (valuex * valuex);
+ M2 += (valuex * valuey);
+ M3 += (valuey * valuey);
+ }
+
+ // Compute eigenvalues of M
+ T = M1 + M3;
+ D = M1 * M3 - M2 * M2;
+ det = sqrt(MAX((T * T / 4.0f) - D, 0.0f));
+ eig1 = MAX((T / 2.0f) + det, 0.0f);
+ eig2 = MAX((T / 2.0f) - det, 0.0f);
+ sig1 = sqrt(eig1);
+ sig2 = sqrt(eig2);
+
+ // Compute normalized eigenvectors
+ V1 = V2 = V3 = V4 = 0.0f;
+
+ if(M2 != 0.0f)
+ {
+ v0 = M2;
+ v1 = eig1 - M3;
+ v2 = eig2 - M3;
+
+ mu1 = sqrtf(v0 * v0 + v1 * v1);
+ mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+ if(mu1 > fTiny)
+ {
+ V1 = v1 / mu1;
+ V3 = v0 / mu1;
+ }
+
+ if(mu2 > fTiny)
+ {
+ V2 = v2 / mu2;
+ V4 = v0 / mu2;
+ }
+
+ } else
+ {
+ if(M1 > M3)
+ {
+ V1 = V4 = 1.0f;
+ V2 = V3 = 0.0f;
+
+ } else
+ {
+ V1 = V4 = 0.0f;
+ V2 = V3 = 1.0f;
+ }
+ }
+
+ // Compute prox_p of the diagonal entries
+ sig1_upd = sig2_upd = 0.0f;
+
+ if(p == 1)
+ {
+ sig1_upd = MAX(sig1 - divsigma, 0.0f);
+ sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+ } else if(p == INFNORM)
+ {
+ proj[0] = sigma * fabs(sig1);
+ proj[1] = sigma * fabs(sig2);
+
+ /*l1 projection part */
+ sum = fLarge;
+ num = 0l;
+ shrinkfactor = 0.0f;
+ while(sum > 1.0f)
+ {
+ sum = 0.0f;
+ num = 0;
+
+ for(ii = 0; ii < 2; ii++)
+ {
+ proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+ sum += fabs(proj[ii]);
+ if(proj[ii]!= 0.0f)
+ num++;
+ }
+
+ if(num > 0)
+ shrinkfactor = (sum - 1.0f) / num;
+ else
+ break;
+ }
+ /*l1 proj ends*/
+
+ sig1_upd = sig1 - divsigma * proj[0];
+ sig2_upd = sig2 - divsigma * proj[1];
+ }
+
+ // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+ if(sig1 > fTiny)
+ sig1_upd /= sig1;
+
+ if(sig2 > fTiny)
+ sig2_upd /= sig2;
+
+ // Compute solution
+ t1 = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+ t2 = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+ t3 = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+
+ for(k = 0; k < dimZ; k++)
+ {
+ gx[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t1 + vy[(dimX*dimY)*k + j*dimX + i] * t2;
+ gy[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t2 + vy[(dimX*dimY)*k + j*dimX + i] * t3;
+ }
+
+ // Delete allocated memory
+ free(proj);
+ }
+ }
+
+ return 1;
+}
+
+float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ)
+{
+ long i, j, k, l;
+ #pragma omp parallel for shared(qx_upd,qy_upd,div_upd) private(i, j, k, l)
+ for(k = 0; k < dimZ; k++) {
+ for(j = 0; j < dimY; j++) {
+ l = j * dimX;
+ for(i = 0; i < dimX; i++) {
+ if(i != dimX-1)
+ {
+ // ux[k][i+l] = u[k][i+1+l] - u[k][i+l]
+ div_upd[(dimX*dimY)*k + i+1+l] -= qx_upd[(dimX*dimY)*k + i+l];
+ div_upd[(dimX*dimY)*k + i+l] += qx_upd[(dimX*dimY)*k + i+l];
+ }
+
+ if(j != dimY-1)
+ {
+ // uy[k][i+l] = u[k][i+width+l] - u[k][i+l]
+ //div_upd[(dimX*dimY)*k + i+dimY+l] -= qy_upd[(dimX*dimY)*k + i+l];
+ div_upd[(dimX*dimY)*k + i+(j+1)*dimX] -= qy_upd[(dimX*dimY)*k + i+l];
+ div_upd[(dimX*dimY)*k + i+l] += qy_upd[(dimX*dimY)*k + i+l];
+ }
+ }
+ }
+ }
+ return *div_upd;
+}
diff --git a/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19 b/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19
new file mode 100644
index 0000000..3ec4250
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19
@@ -0,0 +1,100 @@
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+
+ l = (j * dimX + i) * padZ;
+
+//#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < dimZ; k++) {
+ u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
+ udiff[k] = u[l + k] - u_upd[l + k];
+ unorm += (udiff[k] * udiff[k]);
+
+#ifdef TNV_LOOP_LAST_I
+ gradx_upd[l + k] = 0;
+#else
+ gradx_upd[l + k] = ((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]);
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+ grady_upd[l + k] = 0;
+#else
+ grady_upd[l + k] = ((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]);
+#endif
+
+ gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
+ gradydiff[k] = grady[l + k] - grady_upd[l + k];
+
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+//#define TNV_NEW_STYLE
+#ifdef TNV_NEW_STYLE
+ qx_upd[l + k] = qx[l + k] + sigma * ubarx;
+ qy_upd[l + k] = qy[l + k] + sigma * ubary;
+
+ float vx = divsigma * qx_upd[l + k]; //+ ubarx
+ float vy = divsigma * qy_upd[l + k]; //+ ubary
+#else
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+#endif
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+//#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < dimZ; k++) {
+#ifdef TNV_NEW_STYLE
+ float vx = divsigma * qx_upd[l + k];
+ float vy = divsigma * qy_upd[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] -= sigma * gx_upd;
+ qy_upd[l + k] -= sigma * gy_upd;
+#else
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
+ qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
+#endif
+
+ float div_upd_val = 0;
+#ifndef TNV_LOOP_FIRST_I
+ div_upd_val -= qx_upd[l + k - padZ];
+#endif
+
+#ifndef TNV_LOOP_FIRST_J
+ div_upd_val -= qy_upd[l + k - dimX * padZ];
+#endif
+#ifndef TNV_LOOP_LAST_I
+ div_upd_val += qx_upd[l + k];
+#endif
+#ifndef TNV_LOOP_LAST_J
+ div_upd_val += qy_upd[l + k];
+#endif
+ div_upd[l + k] = div_upd_val;
+
+ qxdiff = qx[l + k] - qx_upd[l + k];
+ qydiff = qy[l + k] - qy_upd[l + k];
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+ resdual1 += fabs(divsigma * qxdiff - gradxdiff[k]);
+ resdual2 += fabs(divsigma * qydiff - gradydiff[k]);
+ product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+#ifndef TNV_LOOP_FIRST_J
+ divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history?
+ resprimal += fabs(divtau * udiff[k] + divdiff);
+#endif
+ }
diff --git a/src/Core/performance_CPU/TNV_core_loop.h.v32 b/src/Core/performance_CPU/TNV_core_loop.h.v32
new file mode 100644
index 0000000..be53156
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core_loop.h.v32
@@ -0,0 +1,119 @@
+ {
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+ l = (j * dimX + i) * padZ;
+ m = dimX * padZ;
+
+ float *__restrict u_next = u + l + padZ;
+ float *__restrict u_current = u + l;
+ float *__restrict u_next_row = u + l + m;
+
+
+ float *__restrict qx_current = qx + l;
+ float *__restrict qy_current = qy + l;
+ float *__restrict qx_prev = qx + l - padZ;
+ float *__restrict qy_prev = qy + l - m;
+
+
+ __assume(padZ%16==0);
+/*
+ __assume_aligned(Input, 64);
+ __assume_aligned(div, 64);
+ __assume_aligned(gradx, 64);
+ __assume_aligned(grady, 64);
+ __assume_aligned(u, 64);
+ __assume_aligned(qx, 64);
+ __assume_aligned(qy, 64);
+ __assume_aligned(u_current, 64);
+ __assume_aligned(u_next, 64);
+ __assume_aligned(u_next_row, 64);
+*/
+
+#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < dimZ; k++) {
+ float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads
+ udiff[k] = u[l + k] - u_upd; // cache 1w
+ u[l + k] = u_upd; // 1 write
+
+#ifdef TNV_LOOP_FIRST_J
+ udiff0[l + k] = udiff[k];
+ div0[l + k] = div[l + k];
+#endif
+
+#ifdef TNV_LOOP_LAST_I
+ float gradx_upd = 0;
+#else
+ float u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads
+ float gradx_upd = (u_next_upd - u_upd); // 2 reads
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+ float grady_upd = 0;
+#else
+ float u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads
+ float grady_upd = (u_next_row_upd - u_upd); // 1 read
+#endif
+
+ gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w
+ gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w
+ gradx[l + k] = gradx_upd; // 1 write
+ grady[l + k] = grady_upd; // 1 write
+
+ ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w
+ ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w
+
+ float vx = ubarx[k] + divsigma * qx[l + k]; // 1 read
+ float vy = ubary[k] + divsigma * qy[l + k]; // 1 read
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < padZ; k++) {
+ float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r
+ float vy = ubary[k] + divsigma * qy_current[k]; // cache 2r
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qxdiff = sigma * (ubarx[k] - gx_upd);
+ qydiff = sigma * (ubary[k] - gy_upd);
+
+ qx_current[k] += qxdiff; // write 1
+ qy_current[k] += qydiff; // write 1
+
+ unorm += (udiff[k] * udiff[k]);
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+ float div_upd = 0;
+
+#ifndef TNV_LOOP_FIRST_I
+ div_upd -= qx_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_FIRST_J
+ div_upd -= qy_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_LAST_I
+ div_upd += qx_current[k];
+#endif
+#ifndef TNV_LOOP_LAST_J
+ div_upd += qy_current[k];
+#endif
+
+ divdiff = div[l + k] - div_upd; // 1 read
+ div[l + k] = div_upd; // 1 write
+
+#ifndef TNV_LOOP_FIRST_J
+ resprimal += fabs(divtau * udiff[k] + divdiff);
+#endif
+
+ // We need to have two independent accumulators to allow gcc-autovectorization
+ resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r
+ resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r
+ product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+ }
+ }
+ \ No newline at end of file