Diffstat (limited to 'src/kernels/cumsum.cl')
-rw-r--r--  src/kernels/cumsum.cl  125
1 file changed, 125 insertions(+), 0 deletions(-)
diff --git a/src/kernels/cumsum.cl b/src/kernels/cumsum.cl
new file mode 100644
index 0000000..4d88b93
--- /dev/null
+++ b/src/kernels/cumsum.cl
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2011-2019 Karlsruhe Institute of Technology
+ *
+ * This file is part of Ufo.
+ *
+ * This library is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+/* Local memory has 16 banks, so pad every index n by n / 16 so that the
+ * strided accesses of the scan do not all fall into the same bank. */
+#define COMPUTE_SHIFTED_INDEX(n) ((n) + ((n) >> 4))
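+/* Example (assuming 16 banks, one float per bank): unpadded indices 0, 16 and
+ * 32 all map to bank 0; padded they become 0, 17 and 34, i.e. banks 0, 1 and
+ * 2, so a stride-16 access pattern no longer serializes on a single bank. */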
+
+void sum_chunk_no_conflicts (global float *input,
+ global float *output,
+ local float *cache,
+ const int height_offset,
+ const int group_offset,
+ const int width,
+ const int local_width,
+ const int group_iterations,
+ const int gx,
+ const int lx)
+{
+ float tmp;
+ int d, ai, bi, offset = 1;
+ int tx_0 = height_offset + 2 * local_width * gx * group_iterations + 2 * lx + group_offset;
+
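+    /* Each work-item loads two consecutive elements of the current chunk into
+     * the bank-shifted local cache, guarding against reading past the end of
+     * the row. */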
+ if (tx_0 - height_offset < width) {
+ cache[COMPUTE_SHIFTED_INDEX (2 * lx)] = input[tx_0];
+ }
+ if (tx_0 + 1 - height_offset < width) {
+ cache[COMPUTE_SHIFTED_INDEX (2 * lx + 1)] = input[tx_0 + 1];
+ }
+ barrier (CLK_LOCAL_MEM_FENCE);
+
+    // up-sweep: build partial sums in place up the tree; afterwards
+    // cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] holds the chunk total
+ for (d = local_width; d > 0; d >>= 1) {
+ if (lx < d) {
+ ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1);
+ bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1);
+ cache[bi] += cache[ai];
+ }
+ offset <<= 1;
+ barrier (CLK_LOCAL_MEM_FENCE);
+ }
+
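+    /* Seed the down-sweep: instead of clearing the last element to zero as in
+     * a plain exclusive scan, store the running total of the preceding chunks
+     * (read back from output) so that consecutive chunks of the same row chain
+     * into one cumulative sum. The extra slot at index 2 * local_width keeps
+     * this chunk's inclusive total for the final write-back. */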
+ if (lx == local_width - 1) {
+ tmp = group_offset == 0 ? 0 : output[height_offset + 2 * local_width * gx * group_iterations + group_offset - 1];
+ cache[COMPUTE_SHIFTED_INDEX (2 * local_width)] = cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] + tmp;
+ cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] = tmp;
+ }
+
+ // traverse down tree & build scan
+ for (d = 1; d <= local_width; d <<= 1) {
+ offset >>= 1;
+ barrier (CLK_LOCAL_MEM_FENCE);
+ if (lx < d) {
+ ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1);
+ bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1);
+ tmp = cache[ai];
+ cache[ai] = cache[bi];
+ cache[bi] += tmp;
+ }
+ }
+ barrier (CLK_LOCAL_MEM_FENCE);
+
+    /* After the down-sweep, cache[COMPUTE_SHIFTED_INDEX (i + 1)] holds the
+     * cumulative sum up to and including chunk element i (carry from earlier
+     * chunks included), so write back the two elements owned by this
+     * work-item. */
+    if (tx_0 - height_offset < width) {
+        output[tx_0] = cache[COMPUTE_SHIFTED_INDEX (2 * lx + 1)];
+    }
+    if (tx_0 + 1 - height_offset < width) {
+        output[tx_0 + 1] = cache[COMPUTE_SHIFTED_INDEX (2 * lx + 2)];
+    }
+}
+
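+/* Cumulative sum along each row of a two-dimensional input. Every work-group
+ * scans group_iterations chunks of 2 * local_width elements of its row
+ * segment, carrying the running total from chunk to chunk through output. If
+ * block_sums is non-NULL, the last value of each group's segment (clamped to
+ * the row end) is stored there so a second pass can propagate totals across
+ * groups (see spread_block_sums below). */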
+kernel void cumsum (global float *input,
+ global float *output,
+ global float *block_sums,
+ local float *cache,
+ const int group_iterations,
+ const int width)
+{
+ int local_width = get_local_size (0);
+ int gx = get_group_id (0);
+ int lx = get_local_id (0);
+ int idy = get_global_id (1);
+ int height_offset = idy * width;
+
+ for (int group_offset = 0; group_offset < 2 * local_width * group_iterations; group_offset += 2 * local_width) {
+ sum_chunk_no_conflicts (input, output, cache, height_offset, group_offset, width, local_width, group_iterations, gx, lx);
+ barrier (CLK_GLOBAL_MEM_FENCE);
+ }
+ if (block_sums && lx == 0) {
+ block_sums[gx + 2 * local_width * idy] = output[height_offset +
+ min(width - 1, 2 * local_width * (gx + 1) * group_iterations - 1)];
+ }
+}
+
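+/* Second pass: work-group gx adds its row's gx-th block sum (assumed to
+ * already hold the cumulative total of the preceding segments) to every
+ * element of the segment that follows segment gx, joining the per-group scans
+ * into one row-wide cumulative sum. */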
+kernel void spread_block_sums (global float *output,
+ global float *block_sums,
+ const int group_iterations,
+ const int width)
+{
+ int local_width = get_local_size (0);
+ int gx = get_group_id (0);
+ int lx = get_local_id (0);
+ int tx = local_width * ((gx + 1) * group_iterations) + lx;
+ int idy = get_global_id (1);
+
+ for (int group_offset = 0; group_offset < local_width * group_iterations; group_offset += local_width) {
+ if (tx + group_offset >= width) {
+ break;
+ }
+ output[idy * width + tx + group_offset] += block_sums[gx + local_width * idy];
+ }
+}