summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTomas Farago <sensej007@email.cz>2019-07-31 13:28:35 +0200
committerTomas Farago <sensej007@email.cz>2020-02-05 10:16:26 +0100
commit692afa64b6e508653d091f641d8a85fc95f7733a (patch)
treebfa8adf14100691352e23294bbf203ec9bfd63bc
parent721b8f2db68f6c9cdd4955eb438d0376e41bd479 (diff)
downloadufo-filters-692afa64b6e508653d091f641d8a85fc95f7733a.tar.gz
ufo-filters-692afa64b6e508653d091f641d8a85fc95f7733a.tar.bz2
ufo-filters-692afa64b6e508653d091f641d8a85fc95f7733a.tar.xz
ufo-filters-692afa64b6e508653d091f641d8a85fc95f7733a.zip
NLM: Add fast version of the algorithm
-rw-r--r--docs/filters.rst13
-rw-r--r--src/kernels/cumsum.cl125
-rw-r--r--src/kernels/meson.build1
-rw-r--r--src/kernels/nlm.cl82
-rw-r--r--src/ufo-non-local-means-task.c463
5 files changed, 668 insertions, 16 deletions
diff --git a/docs/filters.rst b/docs/filters.rst
index d0ef056..2abe2ea 100644
--- a/docs/filters.rst
+++ b/docs/filters.rst
@@ -555,11 +555,24 @@ Non-local-means denoising
which decreases the influence of pixels towards the corners of the
patches.
+ .. gobj:prop:: fast:boolean
+
+ Use a fast version of the algorithm described in [#]_. The only
+ difference in the result from the classical algorithm is that no
+ Gaussian profile is used, and due to the nature of the fast algorithm,
+ floating point precision errors might occur for large images.
+
.. gobj:prop:: addressing-mode:enum
Addressing mode specifies the behavior for pixels falling outside the
original image. See OpenCL ``sampler_t`` documentation for more information.
+ .. [#]
+ J. Darbon, A. Cunha, T.F. Chan, S. Osher, and G.J. Jensen, *Fast
+ nonlocal filtering applied to electron cryomicroscopy* in 5th IEEE
+ International Symposium on Biomedical Imaging: From Nano to Macro,
+ 2008, pp. 1331-1334. DOI:10.1109/ISBI.2008.4541250
+
Horizontal interpolation
------------------------
diff --git a/src/kernels/cumsum.cl b/src/kernels/cumsum.cl
new file mode 100644
index 0000000..4d88b93
--- /dev/null
+++ b/src/kernels/cumsum.cl
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2011-2019 Karlsruhe Institute of Technology
+ *
+ * This file is part of Ufo.
+ *
+ * This library is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+/* There are 16 memory banks, so divide n by their number, which offsets every
+ * thread by n / 16 */
+#define COMPUTE_SHIFTED_INDEX(n) ((n) + ((n) >> 4))
+
+void sum_chunk_no_conflicts (global float *input,
+ global float *output,
+ local float *cache,
+ const int height_offset,
+ const int group_offset,
+ const int width,
+ const int local_width,
+ const int group_iterations,
+ const int gx,
+ const int lx)
+{
+ float tmp;
+ int d, ai, bi, offset = 1;
+ int tx_0 = height_offset + 2 * local_width * gx * group_iterations + 2 * lx + group_offset;
+
+ if (tx_0 - height_offset < width) {
+ cache[COMPUTE_SHIFTED_INDEX (2 * lx)] = input[tx_0];
+ }
+ if (tx_0 + 1 - height_offset < width) {
+ cache[COMPUTE_SHIFTED_INDEX (2 * lx + 1)] = input[tx_0 + 1];
+ }
+ barrier (CLK_LOCAL_MEM_FENCE);
+
+ // build sum in place up the tree
+ for (d = local_width; d > 0; d >>= 1) {
+ if (lx < d) {
+ ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1);
+ bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1);
+ cache[bi] += cache[ai];
+ }
+ offset <<= 1;
+ barrier (CLK_LOCAL_MEM_FENCE);
+ }
+
+ if (lx == local_width - 1) {
+ tmp = group_offset == 0 ? 0 : output[height_offset + 2 * local_width * gx * group_iterations + group_offset - 1];
+ cache[COMPUTE_SHIFTED_INDEX (2 * local_width)] = cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] + tmp;
+ cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] = tmp;
+ }
+
+ // traverse down tree & build scan
+ for (d = 1; d <= local_width; d <<= 1) {
+ offset >>= 1;
+ barrier (CLK_LOCAL_MEM_FENCE);
+ if (lx < d) {
+ ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1);
+ bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1);
+ tmp = cache[ai];
+ cache[ai] = cache[bi];
+ cache[bi] += tmp;
+ }
+ }
+ barrier (CLK_LOCAL_MEM_FENCE);
+
+ if (tx_0 - height_offset < width) {
+ output[tx_0] = cache[bi];
+ }
+ if (tx_0 + 1 - height_offset < width) {
+ output[tx_0 + 1] = cache[COMPUTE_SHIFTED_INDEX (2 * lx + 2)];
+ }
+}
+
+kernel void cumsum (global float *input,
+ global float *output,
+ global float *block_sums,
+ local float *cache,
+ const int group_iterations,
+ const int width)
+{
+ int local_width = get_local_size (0);
+ int gx = get_group_id (0);
+ int lx = get_local_id (0);
+ int idy = get_global_id (1);
+ int height_offset = idy * width;
+
+ for (int group_offset = 0; group_offset < 2 * local_width * group_iterations; group_offset += 2 * local_width) {
+ sum_chunk_no_conflicts (input, output, cache, height_offset, group_offset, width, local_width, group_iterations, gx, lx);
+ barrier (CLK_GLOBAL_MEM_FENCE);
+ }
+ if (block_sums && lx == 0) {
+ block_sums[gx + 2 * local_width * idy] = output[height_offset +
+ min(width - 1, 2 * local_width * (gx + 1) * group_iterations - 1)];
+ }
+}
+
+kernel void spread_block_sums (global float *output,
+ global float *block_sums,
+ const int group_iterations,
+ const int width)
+{
+ int local_width = get_local_size (0);
+ int gx = get_group_id (0);
+ int lx = get_local_id (0);
+ int tx = local_width * ((gx + 1) * group_iterations) + lx;
+ int idy = get_global_id (1);
+
+ for (int group_offset = 0; group_offset < local_width * group_iterations; group_offset += local_width) {
+ if (tx + group_offset >= width) {
+ break;
+ }
+ output[idy * width + tx + group_offset] += block_sums[gx + local_width * idy];
+ }
+}
diff --git a/src/kernels/meson.build b/src/kernels/meson.build
index 7d33d77..03cdac5 100644
--- a/src/kernels/meson.build
+++ b/src/kernels/meson.build
@@ -9,6 +9,7 @@ kernel_files = [
'correlate.cl',
'cut.cl',
'cut-sinogram.cl',
+ 'cumsum.cl',
'denoise.cl',
'dfi.cl',
'edge.cl',
diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl
index 603767c..d2f6b0f 100644
--- a/src/kernels/nlm.cl
+++ b/src/kernels/nlm.cl
@@ -57,6 +57,88 @@ compute_dist (read_only image2d_t input,
}
kernel void
+compute_shifted_mse (read_only image2d_t input,
+ global float *output,
+ sampler_t sampler,
+ const int dx,
+ const int dy,
+ const int padding,
+ const float variance)
+{
+ /* Global dimensions are for the padded output image */
+ int idx = get_global_id (0);
+ int idy = get_global_id (1);
+ int padded_width = get_global_size (0);
+ /* Image is padded by *padding* on both sides of both dimensions. */
+ float cropped_width = (float) (padded_width - 2 * padding);
+ float cropped_height = (float) (get_global_size (1) - 2 * padding);
+ float x = ((float) (idx - padding)) + 0.5f;
+ float y = ((float) (idy - padding)) + 0.5f;
+ float a, b;
+
+ a = read_imagef (input, sampler, (float2) (x / cropped_width, y / cropped_height)).x;
+ b = read_imagef (input, sampler, (float2) ((x + dx) / cropped_width, (y + dy) / cropped_height)).x;
+
+ output[idy * padded_width + idx] = (a - b) * (a - b) - 2 * variance;
+}
+
+kernel void
+process_shift (read_only image2d_t input,
+ global float *integral_input,
+ global float *weights,
+ global float *output,
+ const sampler_t sampler,
+ const int dx,
+ const int dy,
+ const int patch_radius,
+ const int padding,
+ const float coeff,
+ const int is_conjugate)
+{
+ /* Global dimensions are for the cropped output image */
+ int idx = get_global_id (0);
+ int idy = get_global_id (1);
+ /* Image is padded by *padding* on both sides of both dimensions. */
+ int x = idx + padding;
+ int y = idy + padding;
+ float x_im = idx + dx + 0.5f;
+ float y_im = idy + dy + 0.5f;
+ int cropped_width = get_global_size (0);
+ int cropped_height = get_global_size (1);
+ int padded_width = cropped_width + 2 * padding;
+ float dist, weight;
+ if (is_conjugate) {
+ /* direct computation: g(x) = w(x) * f(x + dx)
+ * conjugate: g(x + dx) = w(x) * f(x), where idx = x + dx, so
+ * g(idx) = w(idx - dx) * f(idx - dx) = w(idx - dx) * f(x_im - 2dx)
+ */
+ x -= dx;
+ y -= dy;
+ x_im -= 2 * dx;
+ y_im -= 2 * dy;
+ }
+
+ dist = integral_input[(y + patch_radius) * padded_width + x + patch_radius] -
+ integral_input[(y - patch_radius - 1) * padded_width + x + patch_radius] -
+ integral_input[(y + patch_radius) * padded_width + x - patch_radius - 1] +
+ integral_input[(y - patch_radius - 1) * padded_width + x - patch_radius - 1];
+ dist = fmax (0.0f, dist) * coeff;
+ weight = exp (-dist);
+
+ weights[idy * cropped_width + idx] += weight;
+ output[idy * cropped_width + idx] += weight * read_imagef (input, sampler, (float2) (x_im / cropped_width,
+ y_im / cropped_height)).x;
+}
+
+kernel void divide_inplace (global float *coefficients,
+ global float *output)
+{
+ int index = get_global_size (0) * get_global_id (1) + get_global_id (0);
+
+ output[index] = native_divide (output[index], coefficients[index]);
+}
+
+kernel void
nlm_noise_reduction (read_only image2d_t input,
global float *output,
sampler_t sampler,
diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c
index ee9f2ad..4c14c67 100644
--- a/src/ufo-non-local-means-task.c
+++ b/src/ufo-non-local-means-task.c
@@ -27,17 +27,22 @@
#include "ufo-non-local-means-task.h"
#include "common/ufo-addressing.h"
+#define PIXELS_PER_THREAD 4
+#define NUM_CHUNKS(n, k) (((n) - 1) / (k) + 1)
struct _UfoNonLocalMeansTaskPrivate {
guint search_radius;
guint patch_radius;
+ gsize max_work_group_size, cropped_size[2], padded_size[2];
+ gint padding;
gfloat h;
gfloat sigma;
gboolean use_window;
- cl_kernel kernel;
+ gboolean fast;
+ cl_kernel kernel, mse_kernel, cumsum_kernel, spread_kernel, transpose_kernel, weight_kernel, divide_kernel;
cl_sampler sampler;
cl_context context;
- cl_mem window_mem;
+ cl_mem window_mem, group_sums, aux_mem[2], weights_mem;
AddressingMode addressing_mode;
};
@@ -56,12 +61,36 @@ enum {
PROP_H,
PROP_SIGMA,
PROP_WINDOW,
+ PROP_FAST,
PROP_ADDRESSING_MODE,
N_PROPERTIES
};
static GParamSpec *properties[N_PROPERTIES] = { NULL, };
+static gint
+compute_cumsum_local_width (UfoNonLocalMeansTaskPrivate *priv)
+{
+ gdouble integer;
+ gint local_width;
+
+ /* Compute global and local dimensions for the cumsum kernel */
+ /* First make sure local_width is a power of 2 */
+ modf (log2 (priv->max_work_group_size), &integer);
+ local_width = (gint) pow (2, integer);
+ if (local_width > 4) {
+ /* Empirically determined value on NVIDIA cards */
+ local_width /= 4;
+ }
+ /* *local_width* is the minimum of the power-of-two-padded minimum of
+ * (width, height) and the desired local width.
+ */
+ modf (log2 (MIN (priv->padded_size[0], priv->padded_size[1]) - 1.0f), &integer);
+ local_width = (gsize) MIN (local_width, pow (2, 1 + ((gint) integer)));
+
+ return local_width;
+}
+
/**
* release_coefficients:
* @priv: UfoNonLocalMeansTaskPrivate
@@ -116,6 +145,292 @@ create_coefficients (UfoNonLocalMeansTaskPrivate *priv)
g_free (coefficients);
}
+static void
+release_fast_buffers (UfoNonLocalMeansTaskPrivate *priv)
+{
+ if (priv->group_sums) {
+ UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->group_sums));
+ priv->group_sums = NULL;
+ }
+ if (priv->weights_mem) {
+ UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->weights_mem));
+ priv->weights_mem = NULL;
+ }
+ for (gint i = 0; i < 2; i++) {
+ if (priv->aux_mem[i]) {
+ UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->aux_mem[i]));
+ priv->aux_mem[i] = NULL;
+ }
+ }
+}
+
+static void
+create_fast_buffers (UfoNonLocalMeansTaskPrivate *priv)
+{
+ cl_int err;
+ gint local_width = compute_cumsum_local_width (priv);
+
+ priv->group_sums = clCreateBuffer (priv->context,
+ CL_MEM_READ_WRITE,
+ sizeof (cl_float) * local_width * MAX (priv->padded_size[0], priv->padded_size[1]),
+ NULL,
+ &err);
+ UFO_RESOURCES_CHECK_CLERR (err);
+
+ priv->weights_mem = clCreateBuffer (priv->context,
+ CL_MEM_READ_WRITE,
+ sizeof (cl_float) * priv->cropped_size[0] * priv->cropped_size[1],
+ NULL,
+ &err);
+ for (gint i = 0; i < 2; i++) {
+ priv->aux_mem[i] = clCreateBuffer (priv->context,
+ CL_MEM_READ_WRITE,
+ sizeof (cl_float) * priv->padded_size[0] * priv->padded_size[1],
+ NULL,
+ &err);
+ UFO_RESOURCES_CHECK_CLERR (err);
+ }
+ UFO_RESOURCES_CHECK_CLERR (err);
+}
+
+static void
+transpose (UfoNonLocalMeansTaskPrivate *priv,
+ cl_command_queue cmd_queue,
+ UfoProfiler *profiler,
+ cl_mem in_mem,
+ cl_mem out_mem,
+ const gint width,
+ const gint height)
+{
+ gsize global_size[2];
+ gsize local_size[2] = {32, 32 / PIXELS_PER_THREAD};
+ static gint iteration_number = 0;
+
+ while (local_size[0] * local_size[1] > priv->max_work_group_size) {
+ local_size[0] /= 2;
+ local_size[1] /= 2;
+ }
+ global_size[0] = ((width - 1) / local_size[0] + 1) * local_size[0];
+ global_size[1] = ((height - 1) / local_size[1] / PIXELS_PER_THREAD + 1) * local_size[1];
+ if (!iteration_number) {
+ g_debug ("Image size: %d x %d", width, height);
+ g_debug ("Transpose global work group size: %lu x %lu", global_size[0], global_size[1]);
+ g_debug ("Transpose local work group size: %lu x %lu", local_size[0], local_size[1]);
+ iteration_number++;
+ }
+
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 0, sizeof (cl_mem), &in_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 1, sizeof (cl_mem), &out_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 2,
+ (local_size[0] + 1) * local_size[1] * PIXELS_PER_THREAD * sizeof (cl_float), NULL));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 3, sizeof (cl_int), &width));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 4, sizeof (cl_int), &height));
+ ufo_profiler_call (profiler, cmd_queue, priv->transpose_kernel, 2, global_size, local_size);
+}
+
+static void
+compute_cumsum (UfoNonLocalMeansTaskPrivate *priv,
+ cl_command_queue cmd_queue,
+ UfoProfiler *profiler,
+ cl_mem in_mem,
+ cl_mem out_mem,
+ const gint width,
+ const gint height)
+{
+ gint local_width, num_groups, num_group_iterations, one = 1;
+ gsize cumsum_global_size[2], block_sums_global_size[2], spread_global_size[2],
+ local_size[2], spread_local_size[2];
+ gsize cache_size;
+ static gint iteration_number = 0;
+
+ local_width = compute_cumsum_local_width (priv);
+ /* Number of groups we need to process *width* pixels. If it is more than
+ * *local_width* then use only that number and every group will process more
+ * successive blocks given by group size. This is necessary in order to
+ * avoid recursion in the spreading phase of the group sums. The local cache
+ * memory is limited to *local_width*, every group stores its sum into an
+ * auxiliary buffer, which is then also summed (only one iteration needed,
+ * because there is only one group in the auxiliary buffer since we limit
+ * the number of groups to *local_width*).
+ * This is not the final number of groups, it's just used to compute the
+ * number of iterations of every group.
+ */
+ num_groups = MIN (local_width, NUM_CHUNKS (width, local_width));
+ /* Number of iterations of every group is given by the number of pixels
+ * divided by the number of pixels *num_groups* can process. */
+ num_group_iterations = NUM_CHUNKS (width, local_width * num_groups);
+ /* Finally, the real number of groups is given by the number of pixels
+ * divided by the group size and the number of group iterations. */
+ num_groups = NUM_CHUNKS (width, num_group_iterations * local_width);
+
+ /* Cache size must be larger by *local_width* / 16 because of the bank
+ * conflicts avoidance. Additionally, +1 is needed because of the shifted
+ * access to the local memory.
+ */
+ cache_size = sizeof (cl_float) * (local_width + NUM_CHUNKS (local_width, 16) + 1);
+ cumsum_global_size[0] = num_groups * local_width / 2;
+ cumsum_global_size[1] = height;
+ block_sums_global_size[0] = local_width / 2;
+ block_sums_global_size[1] = height;
+ spread_global_size[0] = (num_groups - 1) * local_width;
+ spread_global_size[1] = height;
+ local_size[0] = local_width / 2;
+ local_size[1] = 1;
+ spread_local_size[0] = local_width;
+ spread_local_size[1] = 1;
+ if (!iteration_number) {
+ g_debug (" width: %d", width);
+ g_debug (" local width: %d", local_width);
+ g_debug (" num groups: %d", num_groups);
+ g_debug ("group iterations: %d", num_group_iterations);
+ g_debug (" kernel dims: %lu %lu %lu %lu", cumsum_global_size[0], cumsum_global_size[1], local_size[0], local_size[1]);
+ g_debug (" cache size: %lu", cache_size);
+ iteration_number++;
+ }
+
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 0, sizeof (cl_mem), &in_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 1, sizeof (cl_mem), &out_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 2, sizeof (cl_mem), &priv->group_sums));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 3, cache_size, NULL));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 4, sizeof (gint), &num_group_iterations));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 5, sizeof (gint), &width));
+ ufo_profiler_call (profiler, cmd_queue, priv->cumsum_kernel, 2, cumsum_global_size, local_size);
+
+ if (num_groups > 1) {
+ /* If there are more than one group, we need to spread the partial sums
+ * of the groups to successive groups. First compute the sums of the
+ * groups and then spread them to respective pixels in them. Because of
+ * our choice of number of iterations per group above, the partial sums
+ * have to be summed only once without recursion.
+ */
+ /* First sum the partial sums. */
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 0, sizeof (cl_mem), &priv->group_sums));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 1, sizeof (cl_mem), &priv->group_sums));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 2, sizeof (cl_mem), NULL));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 3, cache_size, NULL));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 4, sizeof (gint), &one));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 5, sizeof (gint), &local_width));
+ ufo_profiler_call (profiler, cmd_queue, priv->cumsum_kernel, 2, block_sums_global_size, local_size);
+
+ /* Spread them across all pixels. */
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 0, sizeof (cl_mem), &out_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 1, sizeof (cl_mem), &priv->group_sums));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 2, sizeof (gint), &num_group_iterations));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 3, sizeof (gint), &width));
+ ufo_profiler_call (profiler, cmd_queue, priv->spread_kernel, 2, spread_global_size, spread_local_size);
+ }
+}
+
+static void
+compute_sdx (UfoNonLocalMeansTaskPrivate *priv,
+ cl_command_queue cmd_queue,
+ UfoProfiler *profiler,
+ cl_mem in_mem,
+ cl_mem out_mem,
+ const gint dx,
+ const gint dy)
+{
+ gfloat var = priv->sigma * priv->sigma;
+
+ /* First compute the shifted MSE */
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 0, sizeof (cl_mem), &in_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 1, sizeof (cl_mem), &priv->aux_mem[0]));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 2, sizeof (cl_sampler), &priv->sampler));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 3, sizeof (gint), &dx));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 4, sizeof (gint), &dy));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 5, sizeof (gint), &priv->padding));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 6, sizeof (gint), &var));
+ ufo_profiler_call (profiler, cmd_queue, priv->mse_kernel, 2, priv->padded_size, NULL);
+
+ /* Horizontal cumsum and transposition */
+ compute_cumsum (priv, cmd_queue, profiler, priv->aux_mem[0], priv->aux_mem[1], priv->padded_size[0], priv->padded_size[1]);
+ transpose (priv, cmd_queue, profiler, priv->aux_mem[1], priv->aux_mem[0],
+ priv->padded_size[0], priv->padded_size[1]);
+
+ /* 2D cumsum is separable, so compute the horizontal cumsum of the transposed horizontally cumsummed image */
+ compute_cumsum (priv, cmd_queue, profiler, priv->aux_mem[0], priv->aux_mem[1], priv->padded_size[1], priv->padded_size[0]);
+ transpose (priv, cmd_queue, profiler, priv->aux_mem[1], priv->aux_mem[0],
+ priv->padded_size[1], priv->padded_size[0]);
+}
+
+static void
+process_shift (UfoNonLocalMeansTaskPrivate *priv,
+ cl_command_queue cmd_queue,
+ UfoProfiler *profiler,
+ cl_mem input_image,
+ cl_mem integral_mem,
+ cl_mem out_mem,
+ const gint dx,
+ const gint dy)
+{
+ gint patch_size = 2 * priv->patch_radius + 1;
+ gfloat coeff = 1.0f / (priv->h * priv->h * patch_size * patch_size);
+ gint is_conjugate = 0;
+
+ /* Compute r(x) = w(x) * f(x + dx) */
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 0, sizeof (cl_mem), &input_image));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 1, sizeof (cl_mem), &integral_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 2, sizeof (cl_mem), &priv->weights_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 3, sizeof (cl_mem), &out_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 4, sizeof (cl_sampler), &priv->sampler));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 5, sizeof (gint), &dx));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 6, sizeof (gint), &dy));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 7, sizeof (gint), &priv->patch_radius));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 8, sizeof (gint), &priv->padding));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 9, sizeof (gfloat), &coeff));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 10, sizeof (gint), &is_conjugate));
+ ufo_profiler_call (profiler, cmd_queue, priv->weight_kernel, 2, priv->cropped_size, NULL);
+
+ if (dx != 0 || dy != 0) {
+ /* Compute r(x + dx) = w(x + dx) * f(x), which cannot be computed at
+ * once in the above kernel call because one work item would write to
+ * two global memory locations, thus creating a race condition. */
+ is_conjugate = 1;
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 0, sizeof (cl_mem), &input_image));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 1, sizeof (cl_mem), &integral_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 2, sizeof (cl_mem), &priv->weights_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 3, sizeof (cl_mem), &out_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 4, sizeof (cl_sampler), &priv->sampler));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 5, sizeof (gint), &dx));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 6, sizeof (gint), &dy));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 7, sizeof (gint), &priv->patch_radius));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 8, sizeof (gint), &priv->padding));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 9, sizeof (gfloat), &coeff));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 10, sizeof (gint), &is_conjugate));
+ ufo_profiler_call (profiler, cmd_queue, priv->weight_kernel, 2, priv->cropped_size, NULL);
+ }
+}
+
+static void
+compute_nlm_fast (UfoNonLocalMeansTaskPrivate *priv,
+ cl_command_queue cmd_queue,
+ UfoProfiler *profiler,
+ cl_mem in_mem,
+ cl_mem out_mem)
+{
+ gint dx, dy, sr;
+ sr = (gint) priv->search_radius;
+
+ /* Every pixel computes its own result and spreads it to the pixel y + dy,
+ * thus we start at dy = 0 and every pixel gets its dy < 0 value from the pixel
+ * y - dy. For dy = 0, compute only pixels to the right, so the negative
+ * values are obtained from the pixels to the left (analogous to the dy
+ * case).
+ */
+ for (dy = 0; dy < sr + 1; dy++) {
+ for (dx = dy == 0 ? 0 : -sr; dx < sr + 1; dx++) {
+ compute_sdx (priv, cmd_queue, profiler, in_mem, out_mem, dx, dy);
+ process_shift (priv, cmd_queue, profiler, in_mem, priv->aux_mem[0], out_mem, dx, dy);
+ }
+ }
+ /* Now we have the sum of results and weights, divide them to get the
+ * result.
+ */
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->divide_kernel, 0, sizeof (cl_mem), &priv->weights_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->divide_kernel, 1, sizeof (cl_mem), &out_mem));
+ ufo_profiler_call (profiler, cmd_queue, priv->divide_kernel, 2, priv->cropped_size, NULL);
+}
+
UfoNode *
ufo_non_local_means_task_new (void)
{
@@ -131,11 +446,44 @@ ufo_non_local_means_task_setup (UfoTask *task,
cl_int err;
priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
- priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error);
+ if (priv->fast) {
+ priv->mse_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "compute_shifted_mse", NULL, error);
+ priv->cumsum_kernel = ufo_resources_get_kernel (resources, "cumsum.cl", "cumsum", NULL, error);
+ priv->spread_kernel = ufo_resources_get_kernel (resources, "cumsum.cl", "spread_block_sums", NULL, error);
+ priv->transpose_kernel = ufo_resources_get_kernel (resources, "transpose.cl", "transpose_shared", NULL, error);
+ priv->weight_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "process_shift", NULL, error);
+ priv->divide_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "divide_inplace", NULL, error);
+ } else {
+ priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error);
+ }
priv->window_mem = NULL;
+ priv->group_sums = NULL;
+ priv->weights_mem = NULL;
+ for (gint i = 0; i < 2; i++) {
+ priv->aux_mem[i] = NULL;
+ }
- if (priv->kernel)
+ if (priv->kernel) {
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->kernel), error);
+ }
+ if (priv->mse_kernel) {
+ UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->mse_kernel), error);
+ }
+ if (priv->cumsum_kernel) {
+ UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->cumsum_kernel), error);
+ }
+ if (priv->spread_kernel) {
+ UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->spread_kernel), error);
+ }
+ if (priv->transpose_kernel) {
+ UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->transpose_kernel), error);
+ }
+ if (priv->weight_kernel) {
+ UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->weight_kernel), error);
+ }
+ if (priv->divide_kernel) {
+ UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->divide_kernel), error);
+ }
priv->context = ufo_resources_get_context (resources);
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainContext (priv->context), error);
@@ -155,10 +503,35 @@ ufo_non_local_means_task_get_requisition (UfoTask *task,
GError **error)
{
UfoNonLocalMeansTaskPrivate *priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
+ UfoGpuNode *node;
+ GValue *max_work_group_size_gvalue;
+
ufo_buffer_get_requisition (inputs[0], requisition);
+
if (priv->use_window && !priv->window_mem) {
create_coefficients (priv);
}
+ if (priv->fast) {
+ if (priv->max_work_group_size == 0) {
+ node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task)));
+ max_work_group_size_gvalue = ufo_gpu_node_get_info (node, UFO_GPU_NODE_INFO_MAX_WORK_GROUP_SIZE);
+ priv->max_work_group_size = g_value_get_ulong (max_work_group_size_gvalue);
+ g_value_unset (max_work_group_size_gvalue);
+ }
+ if (priv->cropped_size[0] == 0 || priv->cropped_size[0] != requisition->dims[0] ||
+ priv->cropped_size[1] != requisition->dims[1]) {
+ if (priv->cropped_size[0] != 0) {
+ /* Size changed, release first. */
+ release_fast_buffers (priv);
+ }
+ priv->cropped_size[0] = requisition->dims[0];
+ priv->cropped_size[1] = requisition->dims[1];
+ priv->padding = priv->search_radius + priv->patch_radius + 1;
+ priv->padded_size[0] = priv->cropped_size[0] + 2 * priv->padding;
+ priv->padded_size[1] = priv->cropped_size[1] + 2 * priv->padding;
+ create_fast_buffers (priv);
+ }
+ }
}
static guint
@@ -189,30 +562,43 @@ ufo_non_local_means_task_process (UfoTask *task,
UfoNonLocalMeansTaskPrivate *priv;
UfoGpuNode *node;
UfoProfiler *profiler;
+ UfoRequisition in_req;
cl_command_queue cmd_queue;
cl_mem in_mem;
cl_mem out_mem;
gfloat h, var;
+ gfloat fill_pattern = 0.0f;
priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task)));
cmd_queue = ufo_gpu_node_get_cmd_queue (node);
- in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue);
out_mem = ufo_buffer_get_device_array (output, cmd_queue);
+ profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task));
+ ufo_buffer_get_requisition (inputs[0], &in_req);
h = 1 / priv->h / priv->h;
var = priv->sigma * priv->sigma;
- UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem));
- UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem));
- UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler));
- UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius));
- UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius));
- UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &h));
- UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 6, sizeof (gfloat), &var));
- UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 7, sizeof (cl_mem), &priv->window_mem));
-
- profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task));
- ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL);
+ if (priv->fast) {
+ in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue);
+ UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, out_mem, &fill_pattern, sizeof (gfloat), 0,
+ sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1],
+ 0, NULL, NULL));
+ UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, priv->weights_mem, &fill_pattern, sizeof (gfloat), 0,
+ sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1],
+ 0, NULL, NULL));
+ compute_nlm_fast (priv, cmd_queue, profiler, in_mem, out_mem);
+ } else {
+ in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue);
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &h));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 6, sizeof (gfloat), &var));
+ UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 7, sizeof (cl_mem), &priv->window_mem));
+ ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL);
+ }
return TRUE;
}
@@ -242,6 +628,9 @@ ufo_non_local_means_task_set_property (GObject *object,
case PROP_WINDOW:
priv->use_window = g_value_get_boolean (value);
break;
+ case PROP_FAST:
+ priv->fast = g_value_get_boolean (value);
+ break;
case PROP_ADDRESSING_MODE:
priv->addressing_mode = g_value_get_enum (value);
break;
@@ -275,6 +664,9 @@ ufo_non_local_means_task_get_property (GObject *object,
case PROP_WINDOW:
g_value_set_boolean (value, priv->use_window);
break;
+ case PROP_FAST:
+ g_value_set_boolean (value, priv->fast);
+ break;
case PROP_ADDRESSING_MODE:
g_value_set_enum (value, priv->addressing_mode);
break;
@@ -295,6 +687,30 @@ ufo_non_local_means_task_finalize (GObject *object)
UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->kernel));
priv->kernel = NULL;
}
+ if (priv->mse_kernel) {
+ UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->mse_kernel));
+ priv->mse_kernel = NULL;
+ }
+ if (priv->cumsum_kernel) {
+ UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->cumsum_kernel));
+ priv->cumsum_kernel = NULL;
+ }
+ if (priv->spread_kernel) {
+ UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->spread_kernel));
+ priv->spread_kernel = NULL;
+ }
+ if (priv->transpose_kernel) {
+ UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->transpose_kernel));
+ priv->transpose_kernel = NULL;
+ }
+ if (priv->weight_kernel) {
+ UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->weight_kernel));
+ priv->weight_kernel = NULL;
+ }
+ if (priv->divide_kernel) {
+ UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->divide_kernel));
+ priv->divide_kernel = NULL;
+ }
if (priv->sampler) {
UFO_RESOURCES_CHECK_CLERR (clReleaseSampler (priv->sampler));
priv->sampler = NULL;
@@ -304,6 +720,7 @@ ufo_non_local_means_task_finalize (GObject *object)
priv->context = NULL;
}
release_coefficients (priv);
+ release_fast_buffers (priv);
G_OBJECT_CLASS (ufo_non_local_means_task_parent_class)->finalize (object);
}
@@ -363,6 +780,13 @@ ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass)
TRUE,
G_PARAM_READWRITE);
+ properties[PROP_FAST] =
+ g_param_spec_boolean ("fast",
+ "Use a faster version of the algorithm",
+ "Use a faster version of the algorithm",
+ TRUE,
+ G_PARAM_READWRITE);
+
properties[PROP_ADDRESSING_MODE] =
g_param_spec_enum ("addressing-mode",
"Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")",
@@ -384,8 +808,15 @@ ufo_non_local_means_task_init(UfoNonLocalMeansTask *self)
self->priv->search_radius = 10;
self->priv->patch_radius = 3;
+ self->priv->cropped_size[0] = 0;
+ self->priv->cropped_size[1] = 0;
+ self->priv->padded_size[0] = 0;
+ self->priv->padded_size[1] = 0;
+ self->priv->padding = -1;
self->priv->h = 0.1f;
self->priv->sigma = 0.0f;
+ self->priv->max_work_group_size = 0;
self->priv->use_window = TRUE;
self->priv->addressing_mode = CL_ADDRESS_MIRRORED_REPEAT;
+ self->priv->fast = FALSE;
}