1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
|
/*
* Copyright (C) 2011-2019 Karlsruhe Institute of Technology
*
* This file is part of Ufo.
*
* This library is free software: you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, either
* version 3 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*/
/* There are 16 memory banks, so divide n by their number, which offsets every
* thread by n / 16 */
#define COMPUTE_SHIFTED_INDEX(n) ((n) + ((n) >> 4))
/*
 * Scan one chunk of 2*local_width elements of a row: writes the running
 * prefix sum into `output`, continued from the total of all previously
 * scanned chunks of the same row (read back from `output`).
 *
 * Implements the work-efficient up-sweep/down-sweep tree scan in local
 * memory; COMPUTE_SHIFTED_INDEX pads indices to sidestep local-memory
 * bank conflicts (see macro comment above).
 *
 * height_offset     index of the first element of this row (idy * width)
 * group_offset      column offset of this chunk within the group's range
 * width             row length; bounds-checks guard a partial last chunk
 * local_width       work-group size along x (assumed a power of two — the
 *                   tree loops and the final `bi` reuse rely on it; TODO
 *                   confirm against the host code)
 * group_iterations  number of chunks each work-group processes
 * gx, lx            work-group id / local id along x
 */
void sum_chunk_no_conflicts (global float *input,
global float *output,
local float *cache,
const int height_offset,
const int group_offset,
const int width,
const int local_width,
const int group_iterations,
const int gx,
const int lx)
{
float tmp;
int d, ai, bi, offset = 1;
/* Global index of the first of the two elements this thread handles. */
int tx_0 = height_offset + 2 * local_width * gx * group_iterations + 2 * lx + group_offset;
/* Stage two elements per thread into local memory; the `< width` checks
 * keep a partial final chunk from reading past the end of the row. */
if (tx_0 - height_offset < width) {
cache[COMPUTE_SHIFTED_INDEX (2 * lx)] = input[tx_0];
}
if (tx_0 + 1 - height_offset < width) {
cache[COMPUTE_SHIFTED_INDEX (2 * lx + 1)] = input[tx_0 + 1];
}
barrier (CLK_LOCAL_MEM_FENCE);
// build sum in place up the tree
for (d = local_width; d > 0; d >>= 1) {
if (lx < d) {
ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1);
bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1);
cache[bi] += cache[ai];
}
offset <<= 1;
barrier (CLK_LOCAL_MEM_FENCE);
}
/* Seed the down-sweep: tmp is the running total of the chunks already
 * written for this row (0 for the very first chunk), so this chunk's
 * scan continues where the previous chunk left off. Only the last
 * thread touches the tree root, hence the single-thread guard. */
if (lx == local_width - 1) {
tmp = group_offset == 0 ? 0 : output[height_offset + 2 * local_width * gx * group_iterations + group_offset - 1];
cache[COMPUTE_SHIFTED_INDEX (2 * local_width)] = cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] + tmp;
cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] = tmp;
}
// traverse down tree & build scan
for (d = 1; d <= local_width; d <<= 1) {
offset >>= 1;
barrier (CLK_LOCAL_MEM_FENCE);
if (lx < d) {
ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1);
bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1);
tmp = cache[ai];
cache[ai] = cache[bi];
cache[bi] += tmp;
}
}
barrier (CLK_LOCAL_MEM_FENCE);
/* NOTE(review): `bi` is deliberately reused from the loop above. On the
 * final down-sweep pass (d == local_width, offset == 1) every thread
 * takes the `lx < d` branch and leaves bi == COMPUTE_SHIFTED_INDEX
 * (2 * lx + 1), which is the value stored here. Fragile — do not
 * restructure the loop without rewriting this line. */
if (tx_0 - height_offset < width) {
output[tx_0] = cache[bi];
}
if (tx_0 + 1 - height_offset < width) {
output[tx_0 + 1] = cache[COMPUTE_SHIFTED_INDEX (2 * lx + 2)];
}
}
/*
 * Row-wise prefix sum. Each work-group scans a contiguous range of
 * `group_iterations` chunks (2 * local size elements each) of its row,
 * then thread 0 records the last value of the range in `block_sums` so
 * that spread_block_sums can propagate it to the following groups.
 *
 * cache must hold at least 2 * local size (+ padding) floats.
 */
kernel void cumsum (global float *input,
                    global float *output,
                    global float *block_sums,
                    local float *cache,
                    const int group_iterations,
                    const int width)
{
    const int local_width = get_local_size (0);
    const int gx = get_group_id (0);
    const int lx = get_local_id (0);
    const int idy = get_global_id (1);
    const int height_offset = idy * width;
    const int chunk = 2 * local_width;

    /* Scan chunk after chunk; each chunk picks up the running total of
     * the previous one through the output buffer. */
    for (int it = 0; it < group_iterations; it++) {
        sum_chunk_no_conflicts (input, output, cache, height_offset,
                                it * chunk, width, local_width,
                                group_iterations, gx, lx);
        barrier (CLK_GLOBAL_MEM_FENCE);
    }

    /* Record the group's total (last scanned element of its range,
     * clamped to the row end) for the spreading pass. */
    if (block_sums && lx == 0) {
        block_sums[gx + chunk * idy] =
            output[height_offset + min (width - 1, chunk * (gx + 1) * group_iterations - 1)];
    }
}
/*
 * Second pass of the scan: adds work-group gx's block total to every
 * element of the range that follows it, turning the per-group partial
 * scans produced by cumsum into a full row-wide prefix sum.
 *
 * Presumably launched with twice cumsum's local size so that
 * `local_width * idy` matches the `2 * local_width * idy` stride used
 * when block_sums was written — confirm against the host code.
 */
kernel void spread_block_sums (global float *output,
                               global float *block_sums,
                               const int group_iterations,
                               const int width)
{
    const int local_width = get_local_size (0);
    const int gx = get_group_id (0);
    const int lx = get_local_id (0);
    const int idy = get_global_id (1);
    /* First column this thread touches: just past the range that group
     * gx itself scanned. */
    const int first = local_width * ((gx + 1) * group_iterations) + lx;

    for (int it = 0; it < group_iterations; it++) {
        const int col = first + it * local_width;
        if (col >= width) {
            break;
        }
        output[idy * width + col] += block_sums[gx + local_width * idy];
    }
}
|