diff options
Diffstat (limited to 'src/kernels/cumsum.cl')
-rw-r--r-- | src/kernels/cumsum.cl | 125 |
1 files changed, 125 insertions, 0 deletions
diff --git a/src/kernels/cumsum.cl b/src/kernels/cumsum.cl new file mode 100644 index 0000000..4d88b93 --- /dev/null +++ b/src/kernels/cumsum.cl @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2011-2019 Karlsruhe Institute of Technology + * + * This file is part of Ufo. + * + * This library is free software: you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. + */ +/* There are 16 memory banks, so divide n by their number, which offsets every + * thread by n / 16 */ +#define COMPUTE_SHIFTED_INDEX(n) ((n) + ((n) >> 4)) + +void sum_chunk_no_conflicts (global float *input, + global float *output, + local float *cache, + const int height_offset, + const int group_offset, + const int width, + const int local_width, + const int group_iterations, + const int gx, + const int lx) +{ + float tmp; + int d, ai, bi, offset = 1; + int tx_0 = height_offset + 2 * local_width * gx * group_iterations + 2 * lx + group_offset; + + if (tx_0 - height_offset < width) { + cache[COMPUTE_SHIFTED_INDEX (2 * lx)] = input[tx_0]; + } + if (tx_0 + 1 - height_offset < width) { + cache[COMPUTE_SHIFTED_INDEX (2 * lx + 1)] = input[tx_0 + 1]; + } + barrier (CLK_LOCAL_MEM_FENCE); + + // build sum in place up the tree + for (d = local_width; d > 0; d >>= 1) { + if (lx < d) { + ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1); + bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1); + cache[bi] += cache[ai]; + } + offset <<= 1; + barrier (CLK_LOCAL_MEM_FENCE); + } + + if (lx == local_width - 1) { + tmp = group_offset == 0 ? 0 : output[height_offset + 2 * local_width * gx * group_iterations + group_offset - 1]; + cache[COMPUTE_SHIFTED_INDEX (2 * local_width)] = cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] + tmp; + cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] = tmp; + } + + // traverse down tree & build scan + for (d = 1; d <= local_width; d <<= 1) { + offset >>= 1; + barrier (CLK_LOCAL_MEM_FENCE); + if (lx < d) { + ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1); + bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1); + tmp = cache[ai]; + cache[ai] = cache[bi]; + cache[bi] += tmp; + } + } + barrier (CLK_LOCAL_MEM_FENCE); + + if (tx_0 - height_offset < width) { + output[tx_0] = cache[bi]; + } + if (tx_0 + 1 - height_offset < width) { + output[tx_0 + 1] = cache[COMPUTE_SHIFTED_INDEX (2 * lx + 2)]; + } +} + +kernel void cumsum (global float *input, + global float *output, + global float *block_sums, + local float *cache, + const int group_iterations, + const int width) +{ + int local_width = get_local_size (0); + int gx = get_group_id (0); + int lx = get_local_id (0); + int idy = get_global_id (1); + int height_offset = idy * width; + + for (int group_offset = 0; group_offset < 2 * local_width * group_iterations; group_offset += 2 * local_width) { + sum_chunk_no_conflicts (input, output, cache, height_offset, group_offset, width, local_width, group_iterations, gx, lx); + barrier (CLK_GLOBAL_MEM_FENCE); + } + if (block_sums && lx == 0) { + block_sums[gx + 2 * local_width * idy] = output[height_offset + + min(width - 1, 2 * local_width * (gx + 1) * group_iterations - 1)]; + } +} + +kernel void spread_block_sums (global float *output, + global float *block_sums, + const int group_iterations, + const int width) +{ + int local_width = get_local_size (0); + int gx = get_group_id (0); + int lx = get_local_id (0); + int tx = local_width * ((gx + 1) * group_iterations) + lx; + int idy = get_global_id (1); + + for (int group_offset = 0; group_offset < local_width * group_iterations; group_offset += local_width) { + if (tx + group_offset >= width) { + break; + } + output[idy * width + tx + group_offset] += block_sums[gx + local_width * idy]; + } +} |