summaryrefslogtreecommitdiffstats
path: root/src/kernels/stacked-backproject.cl
diff options
context:
space:
mode:
Diffstat (limited to 'src/kernels/stacked-backproject.cl')
-rw-r--r--src/kernels/stacked-backproject.cl381
1 files changed, 381 insertions, 0 deletions
diff --git a/src/kernels/stacked-backproject.cl b/src/kernels/stacked-backproject.cl
new file mode 100644
index 0000000..d6fff5f
--- /dev/null
+++ b/src/kernels/stacked-backproject.cl
@@ -0,0 +1,381 @@
+/*
+ * Copyright (C) 2011-2013 Karlsruhe Institute of Technology
+ *
+ * This file is part of Ufo.
+ *
+ * This library is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+constant sampler_t volumeSampler_single = CLK_NORMALIZED_COORDS_FALSE |
+ CLK_ADDRESS_CLAMP |
+ CLK_FILTER_LINEAR;
+
+constant sampler_t volumeSampler_half = CLK_NORMALIZED_COORDS_FALSE |
+ CLK_ADDRESS_CLAMP |
+ CLK_FILTER_NEAREST;
+
+constant sampler_t volumeSampler_int8 = CLK_NORMALIZED_COORDS_FALSE |
+ CLK_ADDRESS_CLAMP_TO_EDGE |
+ CLK_FILTER_NEAREST;
+
+kernel void
+interleave_single ( global float *sinogram,
+ write_only image2d_array_t interleaved_sinograms)
+{
+ const int idx = get_global_id(0);
+ const int idy = get_global_id(1);
+ const int idz = get_global_id(2);
+ const int sizex = get_global_size(0);
+ const int sizey = get_global_size(1);
+
+ int sinogram_offset = idz*2;
+
+ float x = sinogram[idx + idy * sizex + (sinogram_offset) * sizex * sizey];
+ float y = sinogram[idx + idy * sizex + (sinogram_offset+1) * sizex * sizey];
+
+ write_imagef(interleaved_sinograms, (int4)(idx, idy, idz, 0),(float4)(x,y,0.0f,0.0f));
+}
+
+kernel void
+texture_single (
+ read_only image2d_array_t sinogram,
+ global float2 *reconstructed_buffer,
+ constant float *sin_lut,
+ constant float *cos_lut,
+ const unsigned int x_offset,
+ const unsigned int y_offset,
+ const unsigned int angle_offset,
+ const unsigned int n_projections,
+ const float axis_pos,
+ unsigned long size){
+
+ const int local_idx = get_local_id(0);
+ const int local_idy = get_local_id(1);
+
+ const int global_idx = get_global_id(0);
+ const int global_idy = get_global_id(1);
+ const int idz = get_global_id(2);
+
+ int local_sizex = get_local_size(0);
+ int local_sizey = get_local_size(1);
+
+ int global_sizex = get_global_size(0);
+ int global_sizey = get_global_size(1);
+
+ /* Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant */
+ int square = local_idy%4;
+ int quadrant = local_idx/4;
+ int pixel = local_idx%4;
+
+ /* Computing projection and pixel offsets */
+ int projection_index = local_idy/4;
+
+ int2 remapped_index_local = {(4*square + 2*(quadrant%2) + (pixel%2)),
+ (2* (quadrant/2) + (pixel/2))};
+
+ int2 remapped_index_global = {(get_group_id(0)*get_local_size(0)+remapped_index_local.x),
+ (get_group_id(1)*get_local_size(1)+remapped_index_local.y)};
+
+ float2 pixel_coord = {(remapped_index_global.x - axis_pos + x_offset + 0.5f),
+ (remapped_index_global.y - axis_pos + y_offset+0.5f)}; //bx and by
+
+ float2 sum[4] = {0.0f,0.0f};
+ __local float2 shared_mem[64][4];
+ __local float2 reconstructed_cache[16][16];
+
+
+ for(int proj = projection_index; proj < n_projections; proj+=4) {
+ float sine_value = sin_lut[angle_offset + proj];
+ float h = pixel_coord.x * cos_lut[angle_offset + proj] - pixel_coord.y * sin_lut[angle_offset + proj] + axis_pos;
+ for(int q=0; q<4; q+=1){
+ sum[q] += read_imagef(sinogram, volumeSampler_single, (float4)(h-4*q*sine_value, proj + 0.5f,idz, 0.0)).xy;
+ }
+ }
+
+ int2 remapped_index = {(local_idx%4), (4*local_idy + (local_idx/4))};
+
+ for(int q=0; q<4;q+=1){
+ /* Moving partial sums to shared memory */
+ shared_mem[(local_sizex*remapped_index_local.y + remapped_index_local.x)][projection_index] = sum[q];
+
+ barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
+
+ for(int i=2; i>=1; i/=2){
+ if(remapped_index.x <i){
+ shared_mem[remapped_index.y][remapped_index.x] += shared_mem[remapped_index.y][remapped_index.x+i];
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE); // syncthreads
+ }
+
+ if(remapped_index.x == 0){
+ reconstructed_cache[4*q+remapped_index.y/16][remapped_index.y%16] = shared_mem[remapped_index.y][0];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
+ }
+
+ reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx] * M_PI_F / n_projections;
+}
+
+kernel void
+uninterleave_single (global float2 *reconstructed_buffer,
+ global float *output)
+{
+ const int idx = get_global_id(0);
+ const int idy = get_global_id(1);
+ const int idz = get_global_id(2);
+ const int sizex = get_global_size(0);
+ const int sizey = get_global_size(1);
+ int output_offset = idz*2;
+
+ output[idx + idy*sizex + (output_offset)*sizex*sizey] = reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].x;
+ output[idx + idy*sizex + (output_offset+1)*sizex*sizey] = reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].y;
+}
+
+kernel void
+interleave_half (global float *sinogram,
+ write_only image2d_array_t interleaved_sinograms)
+{
+ const int idx = get_global_id(0);
+ const int idy = get_global_id(1);
+ const int idz = get_global_id(2);
+ const int sizex = get_global_size(0);
+ const int sizey = get_global_size(1);
+
+ int sinogram_offset = idz*4;
+
+ float4 b = {sinogram[idx + idy * sizex + (sinogram_offset) * sizex * sizey],
+ sinogram[idx + idy * sizex + (sinogram_offset+1) * sizex * sizey],
+ sinogram[idx + idy * sizex + (sinogram_offset+2) * sizex * sizey],
+ sinogram[idx + idy * sizex + (sinogram_offset+3) * sizex * sizey]};
+
+ // At each pixel, pack 4 slices in Z-projection
+ write_imagef(interleaved_sinograms, (int4)(idx, idy, idz, 0),(float4)(b));
+}
+
+kernel void
+texture_half (
+ read_only image2d_array_t sinogram,
+ global float4 *reconstructed_buffer,
+ constant float *sin_lut,
+ constant float *cos_lut,
+ const unsigned int x_offset,
+ const unsigned int y_offset,
+ const unsigned int angle_offset,
+ const unsigned int n_projections,
+ const float axis_pos,
+ unsigned long size){
+
+ const int local_idx = get_local_id(0);
+ const int local_idy = get_local_id(1);
+
+ const int global_idx = get_global_id(0);
+ const int global_idy = get_global_id(1);
+ const int idz = get_global_id(2);
+
+ int local_sizex = get_local_size(0);
+ int local_sizey = get_local_size(1);
+
+ int global_sizex = get_global_size(0);
+ int global_sizey = get_global_size(1);
+
+ /* Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant */
+ int square = local_idy%4;
+ int quadrant = local_idx/4;
+ int pixel = local_idx%4;
+
+ /* Computing projection and pixel offsets */
+ int projection_index = local_idy/4;
+ int2 remapped_index_local = {(4*square + 2*(quadrant%2) + (pixel%2)),(2* (quadrant/2) + (pixel/2))};
+ int2 remapped_index_global = {(get_group_id(0)*get_local_size(0)+remapped_index_local.x),
+ (get_group_id(1)*get_local_size(1)+remapped_index_local.y)};
+
+
+ float2 pixel_coord = {(remapped_index_global.x-axis_pos+x_offset+0.5f), (remapped_index_global.y-axis_pos+y_offset+0.5f)}; //bx and by
+ float4 sum[4] = {0.0f,0.0f,0.0f,0.0f};
+ __local float4 shared_mem[64][4];
+ __local float4 reconstructed_cache[16][16];
+
+
+ for(int proj = projection_index; proj < n_projections; proj+=4) {
+ float sine_value = sin_lut[angle_offset + proj];
+ float h = pixel_coord.x * cos_lut[angle_offset + proj] - pixel_coord.y * sin_lut[angle_offset + proj] + axis_pos;
+ for(int q=0; q<4; q+=1){
+ sum[q] += read_imagef(sinogram, volumeSampler_half, (float4)(h-4*q*sine_value, proj + 0.5f,idz, 0.0));
+ }
+ }
+
+ int2 remapped_index = {(local_idx%4), (4*local_idy + (local_idx/4))};
+
+ for(int q=0; q<4;q+=1){
+ /* Moving partial sums to shared memory */
+ shared_mem[(local_sizex*remapped_index_local.y + remapped_index_local.x)][projection_index] = sum[q];
+
+ barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
+
+ for(int i=2; i>=1; i/=2){
+ if(remapped_index.x <i){
+ shared_mem[remapped_index.y][remapped_index.x] += shared_mem[remapped_index.y][remapped_index.x+i];
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE); // syncthreads
+ }
+
+ if(remapped_index.x == 0){
+ reconstructed_cache[4*q+remapped_index.y/16][remapped_index.y%16] = shared_mem[remapped_index.y][0];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
+ }
+ reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx] * M_PI_F / n_projections;
+}
+
+kernel void
+uninterleave_half (global float4 *reconstructed_buffer,
+ global float *output)
+{
+ const int idx = get_global_id(0);
+ const int idy = get_global_id(1);
+ const int idz = get_global_id(2);
+ const int sizex = get_global_size(0);
+ const int sizey = get_global_size(1);
+
+ int output_offset = idz*4;
+
+ float4 b = reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey];
+
+ output[idx + idy*sizex + (output_offset)*sizex*sizey] = b.x;
+ output[idx + idy*sizex + (output_offset+1)*sizex*sizey] = b.y;
+ output[idx + idy*sizex + (output_offset+2)*sizex*sizey] = b.z;
+ output[idx + idy*sizex + (output_offset+3)*sizex*sizey] = b.w;
+}
+
+kernel void
+interleave_uint (global float *sinogram,
+ write_only image2d_array_t interleaved_sinograms,
+ const float min,
+ const float max)
+{
+ const int idx = get_global_id(0);
+ const int idy = get_global_id(1);
+ const int idz = get_global_id(2);
+ const int sizex = get_global_size(0);
+ const int sizey = get_global_size(1);
+
+ int sinogram_offset = idz*4;
+
+ const float scale = 255.0f / (max - min);
+
+ uint4 b = {(sinogram[idx + idy * sizex + (sinogram_offset) * sizex * sizey] - min)*scale,
+ (sinogram[idx + idy * sizex + (sinogram_offset+1) * sizex * sizey] - min)*scale,
+ (sinogram[idx + idy * sizex + (sinogram_offset+2) * sizex * sizey] - min)*scale,
+ (sinogram[idx + idy * sizex + (sinogram_offset+3) * sizex * sizey] - min)*scale};
+
+ write_imageui(interleaved_sinograms, (int4)(idx, idy, idz, 0),(uint4)(b));
+}
+
+kernel void
+texture_uint (
+ read_only image2d_array_t sinogram,
+ global uint4 *reconstructed_buffer,
+ constant float *sin_lut,
+ constant float *cos_lut,
+ const unsigned int x_offset,
+ const unsigned int y_offset,
+ const unsigned int angle_offset,
+ const unsigned int n_projections,
+ const float axis_pos,
+ unsigned long size){
+
+ const int local_idx = get_local_id(0);
+ const int local_idy = get_local_id(1);
+
+ const int global_idx = get_global_id(0);
+ const int global_idy = get_global_id(1);
+ const int idz = get_global_id(2);
+
+ int local_sizex = get_local_size(0);
+ int local_sizey = get_local_size(1);
+
+ int global_sizex = get_global_size(0);
+ int global_sizey = get_global_size(1);
+
+ /* Computing sequential numbers of 4x4 square, quadrant, and pixel within quadrant */
+ int square = local_idy%4;
+ int quadrant = local_idx/4;
+ int pixel = local_idx%4;
+
+ /* Computing projection and pixel offsets */
+ int projection_index = local_idy/4;
+ int2 remapped_index_local = {(4*square + 2*(quadrant%2) + (pixel%2)),(2* (quadrant/2) + (pixel/2))};
+ int2 remapped_index_global = {(get_group_id(0)*get_local_size(0)+remapped_index_local.x),
+ (get_group_id(1)*get_local_size(1)+remapped_index_local.y)};
+
+ float2 pixel_coord = {(remapped_index_global.x-axis_pos+x_offset+0.5f), (remapped_index_global.y-axis_pos+y_offset+0.5f)}; //bx and by
+
+ uint4 sum[4] = {0,0,0,0};
+ __local uint4 shared_mem[64][4];
+ __local uint4 reconstructed_cache[16][16];
+
+ for(int proj = projection_index; proj < n_projections; proj+=4) {
+ float sine_value = sin_lut[angle_offset + proj];
+ float h = pixel_coord.x * cos_lut[angle_offset + proj] - pixel_coord.y * sin_lut[angle_offset + proj] + axis_pos;
+ for(int q=0; q<4; q+=1){
+ sum[q] += read_imageui(sinogram, volumeSampler_int8, (float4)(h-4*q*sine_value, proj + 0.5f,idz, 0.0));
+ }
+ }
+
+
+ int2 remapped_index = {(local_idx%4), (4*local_idy + (local_idx/4))};
+
+ for(int q=0; q<4;q+=1){
+ /* Moving partial sums to shared memory */
+ shared_mem[(local_sizex*remapped_index_local.y + remapped_index_local.x)][projection_index] = sum[q];
+
+ barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
+
+ for(int i=2; i>=1; i/=2){
+ if(remapped_index.x <i){
+ shared_mem[remapped_index.y][remapped_index.x] += shared_mem[remapped_index.y][remapped_index.x+i];
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE); // syncthreads
+ }
+
+ if(remapped_index.x == 0){
+ reconstructed_cache[4*q+remapped_index.y/16][remapped_index.y%16] = shared_mem[remapped_index.y][0];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE); // syncthreads
+ }
+
+ reconstructed_buffer[global_idx + global_idy*size + idz*size*size] = reconstructed_cache[local_idy][local_idx];
+}
+
+kernel void
+uninterleave_uint (global uint4 *reconstructed_buffer,
+ global float *output,
+ const float min,
+ const float max,
+ const unsigned int n_projections)
+{
+ const int idx = get_global_id(0);
+ const int idy = get_global_id(1);
+ const int idz = get_global_id(2);
+ const int sizex = get_global_size(0);
+ const int sizey = get_global_size(1);
+
+ int output_offset = idz*4;
+ float scale = (max-min)/255.0f;
+
+ output[idx + idy*sizex + (output_offset)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].x)*scale+min)* M_PI_F / n_projections;
+ output[idx + idy*sizex + (output_offset+1)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].y)*scale+min)* M_PI_F / n_projections;
+ output[idx + idy*sizex + (output_offset+2)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].z)*scale+min)* M_PI_F / n_projections;
+ output[idx + idy*sizex + (output_offset+3)*sizex*sizey] = ((reconstructed_buffer[idx + idy*sizex + idz*sizex*sizey].w)*scale+min)* M_PI_F / n_projections;
+} \ No newline at end of file