diff options
Diffstat (limited to 'src/Core/regularisers_GPU/TV_PD_GPU_core.cu')
-rw-r--r-- | src/Core/regularisers_GPU/TV_PD_GPU_core.cu | 522 |
1 files changed, 522 insertions, 0 deletions
diff --git a/src/Core/regularisers_GPU/TV_PD_GPU_core.cu b/src/Core/regularisers_GPU/TV_PD_GPU_core.cu new file mode 100644 index 0000000..59fda3b --- /dev/null +++ b/src/Core/regularisers_GPU/TV_PD_GPU_core.cu @@ -0,0 +1,522 @@ +/* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "TV_PD_GPU_core.h" +#include "shared.h" +#include <thrust/functional.h> +#include <thrust/device_vector.h> +#include <thrust/transform_reduce.h> + +/* CUDA implementation of Primal-Dual TV [1] by Chambolle Pock denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambdaPar - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. lipschitz_const: convergence related parameter + * 6. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 7. nonneg: 'nonnegativity (0 is OFF by default, 1 is ON) + * 8. tau: time marching parameter + + * Output: + * [1] TV - Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * + * [1] Antonin Chambolle, Thomas Pock. "A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging", 2010 + */ + +#define BLKXSIZE2D 16 +#define BLKYSIZE2D 16 + +#define BLKXSIZE 8 +#define BLKYSIZE 8 +#define BLKZSIZE 8 + +#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) +// struct square { __host__ __device__ float operator()(float x) { return x * x; } }; + +/************************************************/ +/*****************2D modules*********************/ +/************************************************/ + +__global__ void dualPD_kernel(float *U, float *P1, float *P2, float sigma, int N, int M) +{ + + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + if (xIndex == N-1) P1[index] += sigma*(U[(xIndex-1) + N*yIndex] - U[index]); + else P1[index] += sigma*(U[(xIndex+1) + N*yIndex] - U[index]); + if (yIndex == M-1) P2[index] += sigma*(U[xIndex + N*(yIndex-1)] - U[index]); + else P2[index] += sigma*(U[xIndex + N*(yIndex+1)] - U[index]); + } + return; +} +__global__ void Proj_funcPD2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize) +{ + + float denom; + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + denom = pow(P1[index],2) + pow(P2[index],2); + if (denom > 1.0f) { + P1[index] = P1[index]/sqrt(denom); + P2[index] = P2[index]/sqrt(denom); + } + } + return; +} +__global__ void Proj_funcPD2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize) +{ + + float val1, val2; + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + val1 = abs(P1[index]); + val2 = abs(P2[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + P1[index] = P1[index]/val1; + P2[index] = P2[index]/val2; + } + return; +} +__global__ void DivProj2D_kernel(float *U, float *Input, float *P1, float *P2, float lt, float tau, int N, int M) +{ + float P_v1, P_v2, div_var; + + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + if (xIndex == 0) P_v1 = -P1[index]; + else P_v1 = -(P1[index] - P1[(xIndex-1) + N*yIndex]); + if (yIndex == 0) P_v2 = -P2[index]; + else P_v2 = -(P2[index] - P2[xIndex + N*(yIndex-1)]); + div_var = P_v1 + P_v2; + U[index] = (U[index] - tau*div_var + lt*Input[index])/(1.0 + lt); + } + return; +} +__global__ void PDnonneg2D_kernel(float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + if (Output[index] < 0.0f) Output[index] = 0.0f; + } +} +/************************************************/ +/*****************3D modules*********************/ +/************************************************/ +__global__ void dualPD3D_kernel(float *U, float *P1, float *P2, float *P3, float sigma, int N, int M, int Z) +{ + + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + if (i == N-1) P1[index] += sigma*(U[(N*M)*k + (i-1) + N*j] - U[index]); + else P1[index] += sigma*(U[(N*M)*k + (i+1) + N*j] - U[index]); + if (j == M-1) P2[index] += sigma*(U[(N*M)*k + i + N*(j-1)] - U[index]); + else P2[index] += sigma*(U[(N*M)*k + i + N*(j+1)] - U[index]); + if (k == Z-1) P3[index] += sigma*(U[(N*M)*(k-1) + i + N*j] - U[index]); + else P3[index] += sigma*(U[(N*M)*(k+1) + i + N*j] - U[index]); + } + return; +} +__global__ void Proj_funcPD3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) +{ + + float denom,sq_denom; + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2); + if (denom > 1.0f) { + sq_denom = 1.0f/sqrt(denom); + P1[index] *= sq_denom; + P2[index] *= sq_denom; + P3[index] *= sq_denom; + } + } + return; +} +__global__ void Proj_funcPD3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) +{ + + float val1, val2, val3; + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + val1 = abs(P1[index]); + val2 = abs(P2[index]); + val3 = abs(P3[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + if (val3 < 1.0f) {val3 = 1.0f;} + P1[index] /= val1; + P2[index] /= val2; + P3[index] /= val3; + } + return; +} +__global__ void DivProj3D_kernel(float *U, float *Input, float *P1, float *P2, float *P3, float lt, float tau, int N, int M, int Z) +{ + float P_v1, P_v2, P_v3, div_var; + + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + if (i == 0) P_v1 = -P1[index]; + else P_v1 = -(P1[index] - P1[(N*M)*k + (i-1) + N*j]); + if (j == 0) P_v2 = -P2[index]; + else P_v2 = -(P2[index] - P2[(N*M)*k + i + N*(j-1)]); + if (k == 0) P_v3 = -P3[index]; + else P_v3 = -(P3[index] - P3[(N*M)*(k-1) + i + N*j]); + div_var = P_v1 + P_v2 + P_v3; + U[index] = (U[index] - tau*div_var + lt*Input[index])/(1.0 + lt); + } + return; +} + +__global__ void PDnonneg3D_kernel(float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + if (Output[index] < 0.0f) Output[index] = 0.0f; + } +} +__global__ void PDcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void PDcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void getU2D_kernel(float *Input, float *Input_old, float theta, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Input[index] += theta*(Input[index] - Input_old[index]); + } +} + +__global__ void getU3D_kernel(float *Input, float *Input_old, float theta, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Input[index] += theta*(Input[index] - Input_old[index]); + } +} + +__global__ void PDResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + +__global__ void PDResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + + +/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ + +////////////MAIN HOST FUNCTION /////////////// +extern "C" int TV_PD_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, float lipschitz_const, int methodTV, int nonneg, float tau, int dimX, int dimY, int dimZ) +{ + int deviceCount = -1; // number of devices + cudaGetDeviceCount(&deviceCount); + if (deviceCount == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return -1; + } + int count = 0, i; + float re, sigma, theta, lt; + re = 0.0f; + + sigma = 1.0/(lipschitz_const*tau); + theta = 1.0f; + lt = tau/lambdaPar; + + if (dimZ <= 1) { + /*2D verson*/ + int ImSize = dimX*dimY; + float *d_input, *d_update, *d_old=NULL, *P1=NULL, *P2=NULL; + + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); + dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_old,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + checkCudaErrors( cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(P1, 0, ImSize*sizeof(float)); + cudaMemset(P2, 0, ImSize*sizeof(float)); + + /********************** Run CUDA 2D kernel here ********************/ + /* The main kernel */ + for (i = 0; i < iter; i++) { + + /* computing the the dual P variable */ + dualPD_kernel<<<dimGrid,dimBlock>>>(d_update, P1, P2, sigma, dimX, dimY); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (nonneg != 0) { + PDnonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } + + /* projection step */ + if (methodTV == 0) Proj_funcPD2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/ + else Proj_funcPD2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* copy U to U_old */ + PDcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_old, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* calculate divergence */ + DivProj2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, P1, P2, lt, tau, dimX, dimY); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + PDResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, P1, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + + getU2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, theta, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + cudaFree(d_old); + cudaFree(P1); + cudaFree(P2); + + } + else { + /*3D verson*/ + int ImSize = dimX*dimY*dimZ; + float *d_input, *d_update, *d_old=NULL, *P1=NULL, *P2=NULL, *P3=NULL; + + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); + dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_old,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + checkCudaErrors( cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(P1, 0, ImSize*sizeof(float)); + cudaMemset(P2, 0, ImSize*sizeof(float)); + cudaMemset(P3, 0, ImSize*sizeof(float)); + /********************** Run CUDA 3D kernel here ********************/ + for (i = 0; i < iter; i++) { + + /* computing the the dual P variable */ + dualPD3D_kernel<<<dimGrid,dimBlock>>>(d_update, P1, P2, P3, sigma, dimX, dimY, dimZ); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (nonneg != 0) { + PDnonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } + + /* projection step */ + if (methodTV == 0) Proj_funcPD3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /*isotropic TV*/ + else Proj_funcPD3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /*anisotropic TV*/ + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* copy U to U_old */ + PDcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_old, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* calculate divergence */ + DivProj3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, P1, P2, P3, lt, tau, dimX, dimY, dimZ); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + PDResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, P1, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + } + + /* get U*/ + getU3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_old, theta, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + /***************************************************************/ + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + cudaFree(d_old); + cudaFree(P1); + cudaFree(P2); + cudaFree(P3); + + } + //cudaDeviceReset(); + /*adding info into info_vector */ + infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; +} |