1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
|
/*
* Copyright (C) 2011-2019 Karlsruhe Institute of Technology
*
* This file is part of Ufo.
*
* This library is free software: you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, either
* version 3 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*/
/* There are 16 memory banks, so divide n by their number, which offsets every
* thread by n / 16 */
#define COMPUTE_SHIFTED_INDEX(n) ((n) + ((n) >> 4))
/*
 * Scan one chunk of 2*local_width elements of a row: writes the running
 * prefix sum into `output`, continued from the total of all previously
 * scanned chunks of the same row (read back from `output`).
 *
 * Implements the work-efficient up-sweep/down-sweep tree scan in local
 * memory; COMPUTE_SHIFTED_INDEX pads indices to sidestep local-memory
 * bank conflicts (see macro comment above).
 *
 * height_offset     index of the first element of this row (idy * width)
 * group_offset      column offset of this chunk within the group's range
 * width             row length; bounds-checks guard a partial last chunk
 * local_width       work-group size along x (assumed a power of two — the
 *                   tree loops and the final `bi` reuse rely on it; TODO
 *                   confirm against the host code)
 * group_iterations  number of chunks each work-group processes
 * gx, lx            work-group id / local id along x
 */
void sum_chunk_no_conflicts (global float *input,
global float *output,
local float *cache,
const int height_offset,
const int group_offset,
const int width,
const int local_width,
const int group_iterations,
const int gx,
const int lx)
{
float tmp;
int d, ai, bi, offset = 1;
/* Global index of the first of the two elements this thread handles. */
int tx_0 = height_offset + 2 * local_width * gx * group_iterations + 2 * lx + group_offset;
/* Stage two elements per thread into local memory; the `< width` checks
 * keep a partial final chunk from reading past the end of the row. */
if (tx_0 - height_offset < width) {
cache[COMPUTE_SHIFTED_INDEX (2 * lx)] = input[tx_0];
}
if (tx_0 + 1 - height_offset < width) {
cache[COMPUTE_SHIFTED_INDEX (2 * lx + 1)] = input[tx_0 + 1];
}
barrier (CLK_LOCAL_MEM_FENCE);
// build sum in place up the tree
for (d = local_width; d > 0; d >>= 1) {
if (lx < d) {
ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1);
bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1);
cache[bi] += cache[ai];
}
offset <<= 1;
barrier (CLK_LOCAL_MEM_FENCE);
}
/* Seed the down-sweep: tmp is the running total of the chunks already
 * written for this row (0 for the very first chunk), so this chunk's
 * scan continues where the previous chunk left off. Only the last
 * thread touches the tree root, hence the single-thread guard. */
if (lx == local_width - 1) {
tmp = group_offset == 0 ? 0 : output[height_offset + 2 * local_width * gx * group_iterations + group_offset - 1];
cache[COMPUTE_SHIFTED_INDEX (2 * local_width)] = cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] + tmp;
cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] = tmp;
}
// traverse down tree & build scan
for (d = 1; d <= local_width; d <<= 1) {
offset >>= 1;
barrier (CLK_LOCAL_MEM_FENCE);
if (lx < d) {
ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1);
bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1);
tmp = cache[ai];
cache[ai] = cache[bi];
cache[bi] += tmp;
}
}
barrier (CLK_LOCAL_MEM_FENCE);
/* NOTE(review): `bi` is deliberately reused from the loop above. On the
 * final down-sweep pass (d == local_width, offset == 1) every thread
 * takes the `lx < d` branch and leaves bi == COMPUTE_SHIFTED_INDEX
 * (2 * lx + 1), which is the value stored here. Fragile — do not
 * restructure the loop without rewriting this line. */
if (tx_0 - height_offset < width) {
output[tx_0] = cache[bi];
}
if (tx_0 + 1 - height_offset < width) {
output[tx_0 + 1] = cache[COMPUTE_SHIFTED_INDEX (2 * lx + 2)];
}
}
/*
 * Row-wise prefix sum. Each work-group scans a contiguous range of
 * `group_iterations` chunks (2 * local size elements each) of its row,
 * then thread 0 records the last value of the range in `block_sums` so
 * that spread_block_sums can propagate it to the following groups.
 *
 * cache must hold at least 2 * local size (+ padding) floats.
 */
kernel void cumsum (global float *input,
                    global float *output,
                    global float *block_sums,
                    local float *cache,
                    const int group_iterations,
                    const int width)
{
    const int local_width = get_local_size (0);
    const int gx = get_group_id (0);
    const int lx = get_local_id (0);
    const int idy = get_global_id (1);
    const int height_offset = idy * width;
    const int chunk = 2 * local_width;

    /* Scan chunk after chunk; each chunk picks up the running total of
     * the previous one through the output buffer. */
    for (int it = 0; it < group_iterations; it++) {
        sum_chunk_no_conflicts (input, output, cache, height_offset,
                                it * chunk, width, local_width,
                                group_iterations, gx, lx);
        barrier (CLK_GLOBAL_MEM_FENCE);
    }

    /* Record the group's total (last scanned element of its range,
     * clamped to the row end) for the spreading pass. */
    if (block_sums && lx == 0) {
        block_sums[gx + chunk * idy] =
            output[height_offset + min (width - 1, chunk * (gx + 1) * group_iterations - 1)];
    }
}
/*
 * Second pass of the scan: adds work-group gx's block total to every
 * element of the range that follows it, turning the per-group partial
 * scans produced by cumsum into a full row-wide prefix sum.
 *
 * Presumably launched with twice cumsum's local size so that
 * `local_width * idy` matches the `2 * local_width * idy` stride used
 * when block_sums was written — confirm against the host code.
 */
kernel void spread_block_sums (global float *output,
                               global float *block_sums,
                               const int group_iterations,
                               const int width)
{
    const int local_width = get_local_size (0);
    const int gx = get_group_id (0);
    const int lx = get_local_id (0);
    const int idy = get_global_id (1);
    /* First column this thread touches: just past the range that group
     * gx itself scanned. */
    const int first = local_width * ((gx + 1) * group_iterations) + lx;

    for (int it = 0; it < group_iterations; it++) {
        const int col = first + it * local_width;
        if (col >= width) {
            break;
        }
        output[idy * width + col] += block_sums[gx + local_width * idy];
    }
}
|