Diffstat (limited to 'src/Core/regularisers_GPU/TV_PD_GPU_core.cu')
 src/Core/regularisers_GPU/TV_PD_GPU_core.cu (new file, mode -rw-r--r--) | 522
 1 file changed, 522 insertions(+), 0 deletions(-)
diff --git a/src/Core/regularisers_GPU/TV_PD_GPU_core.cu b/src/Core/regularisers_GPU/TV_PD_GPU_core.cu
new file mode 100644
index 0000000..59fda3b
--- /dev/null
+++ b/src/Core/regularisers_GPU/TV_PD_GPU_core.cu
@@ -0,0 +1,522 @@
+/*
+This work is part of the Core Imaging Library developed by
+Visual Analytics and Imaging System Group of the Science Technology
+Facilities Council, STFC
+
+Copyright 2017 Daniil Kazantsev
+Copyright 2017 Srikanth Nagella, Edoardo Pasca
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "TV_PD_GPU_core.h"
+#include "shared.h"
+#include <thrust/functional.h>
+#include <thrust/device_vector.h>
+#include <thrust/transform_reduce.h>
+#include <cmath> /* std::sqrt used in the host code below */
+
+/* CUDA implementation of the Primal-Dual (PD) TV denoising/regularisation model
+ * of Chambolle and Pock [1] (2D/3D case)
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume
+ * 2. lambdaPar - regularisation parameter
+ * 3. iter - number of iterations
+ * 4. epsil - tolerance constant
+ * 5. lipschitz_const - convergence-related parameter
+ * 6. methodTV - TV type: 'iso' (0) or 'l1' (1)
+ * 7. nonneg - nonnegativity constraint (0 is OFF by default, 1 is ON)
+ * 8. tau - time-marching (primal step-size) parameter
+ *
+ * Output:
+ * [1] TV-filtered/regularised image/volume
+ * [2] Information vector which contains [iteration no., reached tolerance]
+ *
+ * [1] Antonin Chambolle, Thomas Pock. "A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging", 2010
+ */
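+
+/* A minimal host-side usage sketch (illustrative only: the buffer handling and
+   parameter values below are assumptions, not part of the library):
+
+       int dimX = 512, dimY = 512, dimZ = 1;                  // 2D image
+       float *in = ...;                                       // dimX*dimY noisy pixels
+       float *out = (float*)malloc(dimX*dimY*sizeof(float));
+       float info[2];
+       // lambdaPar=0.05, 500 iterations, tolerance 1e-6, lipschitz_const=8 (2D),
+       // isotropic TV (0), no nonnegativity (0), tau=0.0025
+       TV_PD_GPU_main(in, out, info, 0.05f, 500, 1e-6f, 8.0f, 0, 0, 0.0025f, dimX, dimY, dimZ);
+       // info[0] holds the iterations performed, info[1] the reached tolerance
+*/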
+
+#define BLKXSIZE2D 16
+#define BLKYSIZE2D 16
+
+#define BLKXSIZE 8
+#define BLKYSIZE 8
+#define BLKZSIZE 8
+
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+/* square<T> unary functor used by the Thrust reductions below; it is expected to
+   come from shared.h, e.g.:
+   template <typename T> struct square { __host__ __device__ T operator()(const T& x) const { return x * x; } }; */
+
+/************************************************/
+/*****************2D modules*********************/
+/************************************************/
+
+__global__ void dualPD_kernel(float *U, float *P1, float *P2, float sigma, int N, int M)
+{
+
+ //calculate each thread global index
+ const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+ const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if ((xIndex < N) && (yIndex < M)) {
+ if (xIndex == N-1) P1[index] += sigma*(U[(xIndex-1) + N*yIndex] - U[index]);
+ else P1[index] += sigma*(U[(xIndex+1) + N*yIndex] - U[index]);
+ if (yIndex == M-1) P2[index] += sigma*(U[xIndex + N*(yIndex-1)] - U[index]);
+ else P2[index] += sigma*(U[xIndex + N*(yIndex+1)] - U[index]);
+ }
+ return;
+}
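+/* Isotropic TV projection: scale the dual vector (P1,P2) back onto the unit
+   L2 ball, P <- P/max(1, |P|_2) */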
+__global__ void Proj_funcPD2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize)
+{
+
+    float denom, sq_denom;
+ //calculate each thread global index
+ const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+ const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if ((xIndex < N) && (yIndex < M)) {
+   denom = P1[index]*P1[index] + P2[index]*P2[index];
+   if (denom > 1.0f) {
+       sq_denom = 1.0f/sqrtf(denom);
+       P1[index] *= sq_denom;
+       P2[index] *= sq_denom;
+   }
+ }
+ return;
+}
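+/* Anisotropic (l1) TV projection: clip each dual component independently,
+   P_k <- P_k/max(1, |P_k|) */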
+__global__ void Proj_funcPD2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize)
+{
+
+ float val1, val2;
+ //calculate each thread global index
+ const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+ const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if ((xIndex < N) && (yIndex < M)) {
+   val1 = fabsf(P1[index]);
+   val2 = fabsf(P2[index]);
+ if (val1 < 1.0f) {val1 = 1.0f;}
+ if (val2 < 1.0f) {val2 = 1.0f;}
+ P1[index] = P1[index]/val1;
+ P2[index] = P2[index]/val2;
+ }
+ return;
+}
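+/* Primal descent combined with the proximal map of the data-fidelity term:
+   U <- (U - tau*K^T P + lt*Input)/(1 + lt), where K^T P = -div(P) (adjoint of
+   the forward-difference gradient) and lt = tau/lambda */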
+__global__ void DivProj2D_kernel(float *U, float *Input, float *P1, float *P2, float lt, float tau, int N, int M)
+{
+ float P_v1, P_v2, div_var;
+
+ //calculate each thread global index
+ const int xIndex=blockIdx.x*blockDim.x+threadIdx.x;
+ const int yIndex=blockIdx.y*blockDim.y+threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if ((xIndex < N) && (yIndex < M)) {
+ if (xIndex == 0) P_v1 = -P1[index];
+ else P_v1 = -(P1[index] - P1[(xIndex-1) + N*yIndex]);
+ if (yIndex == 0) P_v2 = -P2[index];
+ else P_v2 = -(P2[index] - P2[xIndex + N*(yIndex-1)]);
+ div_var = P_v1 + P_v2;
+       U[index] = (U[index] - tau*div_var + lt*Input[index])/(1.0f + lt);
+ }
+ return;
+}
+__global__ void PDnonneg2D_kernel(float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+    if ((xIndex < N) && (yIndex < M)) {
+ if (Output[index] < 0.0f) Output[index] = 0.0f;
+ }
+}
+/************************************************/
+/*****************3D modules*********************/
+/************************************************/
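+/* The 3D kernels below mirror their 2D counterparts, adding the z-gradient and
+   the third dual component P3 */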
+__global__ void dualPD3D_kernel(float *U, float *P1, float *P2, float *P3, float sigma, int N, int M, int Z)
+{
+
+ //calculate each thread global index
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+ if ((i < N) && (j < M) && (k < Z)) {
+ if (i == N-1) P1[index] += sigma*(U[(N*M)*k + (i-1) + N*j] - U[index]);
+ else P1[index] += sigma*(U[(N*M)*k + (i+1) + N*j] - U[index]);
+ if (j == M-1) P2[index] += sigma*(U[(N*M)*k + i + N*(j-1)] - U[index]);
+ else P2[index] += sigma*(U[(N*M)*k + i + N*(j+1)] - U[index]);
+ if (k == Z-1) P3[index] += sigma*(U[(N*M)*(k-1) + i + N*j] - U[index]);
+ else P3[index] += sigma*(U[(N*M)*(k+1) + i + N*j] - U[index]);
+ }
+ return;
+}
+__global__ void Proj_funcPD3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
+{
+
+ float denom,sq_denom;
+ //calculate each thread global index
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+ if ((i < N) && (j < M) && (k < Z)) {
+   denom = P1[index]*P1[index] + P2[index]*P2[index] + P3[index]*P3[index];
+   if (denom > 1.0f) {
+       sq_denom = 1.0f/sqrtf(denom);
+ P1[index] *= sq_denom;
+ P2[index] *= sq_denom;
+ P3[index] *= sq_denom;
+ }
+ }
+ return;
+}
+__global__ void Proj_funcPD3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize)
+{
+
+ float val1, val2, val3;
+ //calculate each thread global index
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+ if ((i < N) && (j < M) && (k < Z)) {
+   val1 = fabsf(P1[index]);
+   val2 = fabsf(P2[index]);
+   val3 = fabsf(P3[index]);
+ if (val1 < 1.0f) {val1 = 1.0f;}
+ if (val2 < 1.0f) {val2 = 1.0f;}
+ if (val3 < 1.0f) {val3 = 1.0f;}
+ P1[index] /= val1;
+ P2[index] /= val2;
+ P3[index] /= val3;
+ }
+ return;
+}
+__global__ void DivProj3D_kernel(float *U, float *Input, float *P1, float *P2, float *P3, float lt, float tau, int N, int M, int Z)
+{
+ float P_v1, P_v2, P_v3, div_var;
+
+ //calculate each thread global index
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+ if ((i < N) && (j < M) && (k < Z)) {
+ if (i == 0) P_v1 = -P1[index];
+ else P_v1 = -(P1[index] - P1[(N*M)*k + (i-1) + N*j]);
+ if (j == 0) P_v2 = -P2[index];
+ else P_v2 = -(P2[index] - P2[(N*M)*k + i + N*(j-1)]);
+ if (k == 0) P_v3 = -P3[index];
+ else P_v3 = -(P3[index] - P3[(N*M)*(k-1) + i + N*j]);
+ div_var = P_v1 + P_v2 + P_v3;
+       U[index] = (U[index] - tau*div_var + lt*Input[index])/(1.0f + lt);
+ }
+ return;
+}
+
+__global__ void PDnonneg3D_kernel(float* Output, int N, int M, int Z, int num_total)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+    if ((i < N) && (j < M) && (k < Z)) {
+ if (Output[index] < 0.0f) Output[index] = 0.0f;
+ }
+}
+__global__ void PDcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+    if ((xIndex < N) && (yIndex < M)) {
+ Output[index] = Input[index];
+ }
+}
+
+__global__ void PDcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+    if ((i < N) && (j < M) && (k < Z)) {
+ Output[index] = Input[index];
+ }
+}
+
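+/* Over-relaxation (extrapolation) step of the PD scheme: U <- U + theta*(U - U_old) */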
+__global__ void getU2D_kernel(float *Input, float *Input_old, float theta, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+    /* per-dimension guard: with a flat index check, out-of-tile threads could
+       alias valid pixels and apply the update twice */
+    if ((xIndex < N) && (yIndex < M)) {
+ Input[index] += theta*(Input[index] - Input_old[index]);
+ }
+}
+
+__global__ void getU3D_kernel(float *Input, float *Input_old, float theta, int N, int M, int Z, int num_total)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+    if ((i < N) && (j < M) && (k < Z)) {
+ Input[index] += theta*(Input[index] - Input_old[index]);
+ }
+}
+
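+/* Elementwise difference of successive iterates, feeding the tolerance-based
+   stopping rule in the host loop */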
+__global__ void PDResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+    if ((xIndex < N) && (yIndex < M)) {
+ Output[index] = Input1[index] - Input2[index];
+ }
+}
+
+__global__ void PDResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+    if ((i < N) && (j < M) && (k < Z)) {
+ Output[index] = Input1[index] - Input2[index];
+ }
+}
+
+
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+
+////////////MAIN HOST FUNCTION ///////////////
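+/* Each PD iteration performs: dual ascent on P -> (optional) nonnegativity on U ->
+   projection of P -> primal descent on U -> over-relaxation. When epsil > 0, every
+   5th iteration the relative change ||U - U_old||_2/||U||_2 is computed with Thrust;
+   the loop stops once it has been below epsil more than 3 times. */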
+extern "C" int TV_PD_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, float lipschitz_const, int methodTV, int nonneg, float tau, int dimX, int dimY, int dimZ)
+{
+ int deviceCount = -1; // number of devices
+ cudaGetDeviceCount(&deviceCount);
+    if (deviceCount <= 0) {
+ fprintf(stderr, "No CUDA devices found\n");
+ return -1;
+ }
+ int count = 0, i;
+ float re, sigma, theta, lt;
+ re = 0.0f;
+
+    sigma = 1.0f/(lipschitz_const*tau);
+ theta = 1.0f;
+ lt = tau/lambdaPar;
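+    /* sigma is set so that sigma*tau*lipschitz_const = 1, in line with the PD
+       step-size condition of [1] (lipschitz_const presumably playing the role of
+       the squared norm of the gradient operator) */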
+
+ if (dimZ <= 1) {
+    /*2D version*/
+ int ImSize = dimX*dimY;
+    float *d_input, *d_update, *d_old=NULL, *P1=NULL, *P2=NULL, *d_res=NULL;
+
+ dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+ dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
+
+ /*allocate space for images on device*/
+ checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+ checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
+ checkCudaErrors( cudaMalloc((void**)&d_old,ImSize*sizeof(float)) );
+    checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
+    checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
+    /* dedicated residual buffer so the stopping-rule check does not overwrite the dual variable P1 */
+    checkCudaErrors( cudaMalloc((void**)&d_res,ImSize*sizeof(float)) );
+
+ checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+ checkCudaErrors( cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+ cudaMemset(P1, 0, ImSize*sizeof(float));
+ cudaMemset(P2, 0, ImSize*sizeof(float));
+
+ /********************** Run CUDA 2D kernel here ********************/
+ /* The main kernel */
+ for (i = 0; i < iter; i++) {
+
+       /* computing the dual P variable */
+ dualPD_kernel<<<dimGrid,dimBlock>>>(d_update, P1, P2, sigma, dimX, dimY);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ if (nonneg != 0) {
+ PDnonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() ); }
+
+ /* projection step */
+ if (methodTV == 0) Proj_funcPD2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/
+ else Proj_funcPD2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ /* copy U to U_old */
+ PDcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_old, dimX, dimY, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ /* calculate divergence */
+ DivProj2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, P1, P2, lt, tau, dimX, dimY);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ if ((epsil != 0.0f) && (i % 5 == 0)) {
+ /* calculate norm - stopping rules using the Thrust library */
+        PDResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, d_res, dimX, dimY, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ // setup arguments
+ square<float> unary_op;
+ thrust::plus<float> binary_op;
+        thrust::device_vector<float> d_vec(d_res, d_res + ImSize);
+ float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op));
+ thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
+ float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op));
+
+        // relative change ||U - U_old||/||U||
+        re = (reduction/reduction2);
+ if (re < epsil) count++;
+ if (count > 3) break;
+ }
+
+ getU2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, theta, dimX, dimY, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+ }
+ //copy result matrix from device to host memory
+ cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
+
+ cudaFree(d_input);
+ cudaFree(d_update);
+ cudaFree(d_old);
+    cudaFree(P1);
+    cudaFree(P2);
+    cudaFree(d_res);
+
+ }
+ else {
+    /*3D version*/
+ int ImSize = dimX*dimY*dimZ;
+    float *d_input, *d_update, *d_old=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *d_res=NULL;
+
+ dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
+ dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE));
+
+ /*allocate space for images on device*/
+ checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+ checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
+ checkCudaErrors( cudaMalloc((void**)&d_old,ImSize*sizeof(float)) );
+    checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
+    checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
+    checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) );
+    /* dedicated residual buffer so the stopping-rule check does not overwrite the dual variable P1 */
+    checkCudaErrors( cudaMalloc((void**)&d_res,ImSize*sizeof(float)) );
+
+ checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+ checkCudaErrors( cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+ cudaMemset(P1, 0, ImSize*sizeof(float));
+ cudaMemset(P2, 0, ImSize*sizeof(float));
+ cudaMemset(P3, 0, ImSize*sizeof(float));
+ /********************** Run CUDA 3D kernel here ********************/
+ for (i = 0; i < iter; i++) {
+
+       /* computing the dual P variable */
+ dualPD3D_kernel<<<dimGrid,dimBlock>>>(d_update, P1, P2, P3, sigma, dimX, dimY, dimZ);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ if (nonneg != 0) {
+ PDnonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() ); }
+
+ /* projection step */
+ if (methodTV == 0) Proj_funcPD3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /*isotropic TV*/
+ else Proj_funcPD3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /*anisotropic TV*/
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ /* copy U to U_old */
+ PDcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_old, dimX, dimY, dimZ, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ /* calculate divergence */
+ DivProj3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, P1, P2, P3, lt, tau, dimX, dimY, dimZ);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ if ((epsil != 0.0f) && (i % 5 == 0)) {
+ /* calculate norm - stopping rules using the Thrust library */
+        PDResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, d_res, dimX, dimY, dimZ, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ // setup arguments
+ square<float> unary_op;
+ thrust::plus<float> binary_op;
+        thrust::device_vector<float> d_vec(d_res, d_res + ImSize);
+ float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op));
+ thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
+ float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op));
+
+        // relative change ||U - U_old||/||U||
+        re = (reduction/reduction2);
+ if (re < epsil) count++;
+ if (count > 3) break;
+ }
+
+ /* get U*/
+ getU3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, theta, dimX, dimY, dimZ, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+ }
+ /***************************************************************/
+ //copy result matrix from device to host memory
+ cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
+
+ cudaFree(d_input);
+ cudaFree(d_update);
+ cudaFree(d_old);
+    cudaFree(P1);
+    cudaFree(P2);
+    cudaFree(P3);
+    cudaFree(d_res);
+
+ }
+ //cudaDeviceReset();
+ /*adding info into info_vector */
+ infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/
+ infovector[1] = re; /* reached tolerance */
+ return 0;
+}