Merge pull request #182 from ufo-kit/nlm

Nlm
author: Tomas Farago <sensej007@email.cz> 2020-02-05 10:27:41 +0100
committer: GitHub <noreply@github.com> 2020-02-05 10:27:41 +0100
commit: 076ef3b717c58ae1fca593ab936c649aa14936a9 (patch)
tree: 73d463b635762db907c61fd1737eb4040d9661b6
parent: 796ba68cfae854736c0adae6cdb56f93ef525e64 (diff)
parent: c94277081c5c0d6cb9f803bc96523dc7ff73a53a (diff)
download: ufo-filters-076ef3b717c58ae1fca593ab936c649aa14936a9.tar.gz
ufo-filters-076ef3b717c58ae1fca593ab936c649aa14936a9.tar.bz2
ufo-filters-076ef3b717c58ae1fca593ab936c649aa14936a9.tar.xz
ufo-filters-076ef3b717c58ae1fca593ab936c649aa14936a9.zip
15 files changed, 1169 insertions, 70 deletions
diff --git a/docs/filters.rst b/docs/filters.rst
index c04b536..8afd0cb 100644
--- a/docs/filters.rst
+++ b/docs/filters.rst
@@ -538,9 +538,56 @@ Non-local-means denoising
 
         Radius of patches.
 
+    .. gobj:prop:: h:float
+
+        Smoothing control parameter, should be around noise standard deviation
+        or slightly less. Higher h results in a smoother image but with blurred
+        features. If it is 0, estimate noise standard deviation and use it as
+        the parameter value.
+
     .. gobj:prop:: sigma:float
 
-        Sigma influencing the Gaussian weighting.
+        Noise standard deviation, improves weights computation. If it is zero,
+        it is *not* automatically estimated as opposed to :gobj:prop:`h`.
+        :gobj:prop:`estimate-sigma` has to be specified in order to override
+        :gobj:prop:`sigma` value.
+
+    .. gobj:prop:: window:boolean
+
+        Apply Gaussian profile with :math:`\sigma = \frac{P}{2}`, where :math:`P`
+        is the :gobj:prop:`patch-radius` parameter to the weight computation
+        which decreases the influence of pixels towards the corners of the
+        patches.
+
+    .. gobj:prop:: fast:boolean
+
+        Use a fast version of the algorithm described in [#]_. The only
+        difference in the result from the classical algorithm is that there is
+        no Gaussian profile used and from the nature of the fast algorithm,
+        floating point precision errors might occur for large images.
+
+    .. gobj:prop:: estimate-sigma:boolean
+
+        Estimate sigma based on [#]_, which overrides :gobj:prop:`sigma`
+        parameter value. Only the first image in a sequence is used for
+        estimation and the estimated sigma is re-used for every consequent
+        image.
+
+    .. gobj:prop:: addressing-mode:enum
+
+        Addressing mode specifies the behavior for pixels falling outside the
+        original image. See OpenCL ``sampler_t`` documentation for more information.
+
+    .. [#]
+        J. Darbon, A. Cunha, T.F. Chan, S. Osher, and G.J. Jensen, *Fast
+        nonlocal filtering applied to electron cryomicroscopy* in 5th IEEE
+        International Symposium on Biomedical Imaging: From Nano to Macro,
+        2008, pp. 1331-1334. DOI:10.1109/ISBI.2008.4541250
+
+    .. [#]
+        J. Immerkaer, *Fast noise variance estimation* in Computer vision and image
+        understanding 64.2 (1996): 300-302. DOI:10.1006/cviu.1996.0060
+
 
 
 Horizontal interpolation
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d4bc70c..6f69f19 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -127,6 +127,10 @@ set(general_backproject_aux_SRCS
     common/ufo-scarray.c
     common/ufo-ctgeometry.c)
 
+set(non_local_means_aux_SRCS
+    common/ufo-math.c
+    common/ufo-common.c)
+
 file(GLOB ufofilter_KERNELS "kernels/*.cl")
 #}}}
 #{{{ Variables
diff --git a/src/common/ufo-common.c b/src/common/ufo-common.c
new file mode 100644
index 0000000..f157cf4
--- /dev/null
+++ b/src/common/ufo-common.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2015-2019 Karlsruhe Institute of Technology
+ *
+ * This file is part of Ufo.
+ *
+ * This library is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include <glib.h>
+#include "ufo-math.h"
+#include "ufo-common.h"
+
+gfloat
+ufo_common_estimate_sigma (cl_kernel convolution_kernel,
+                           cl_kernel sum_kernel,
+                           cl_command_queue cmd_queue,
+                           cl_sampler sampler,
+                           UfoProfiler *profiler,
+                           cl_mem input_image,
+                           cl_mem out_mem,
+                           const gsize max_work_group_size,
+                           const gsize *global_size)
+{
+    gsize n = global_size[0] * global_size[1];
+    gsize local_size, num_groups, global_size_1D;
+    gint num_group_iterations;
+    gfloat *result, sum = 0.0f;
+    cl_int err;
+    cl_mem group_sums;
+    cl_context context;
+    
+    clGetCommandQueueInfo (cmd_queue, CL_QUEUE_CONTEXT, sizeof (cl_context), &context, NULL);
+
+    /* First compute the convolution of the input with the difference of
+     * laplacians.
+     */
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (convolution_kernel, 0, sizeof (cl_mem), &input_image));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (convolution_kernel, 1, sizeof (cl_sampler), &sampler));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (convolution_kernel, 2, sizeof (cl_mem), &out_mem));
+    ufo_profiler_call (profiler, cmd_queue, convolution_kernel, 2, global_size, NULL);
+
+    /* Now compute partial sums of the convolved image. */
+    /* Compute global and local dimensions for the cumsum kernel */
+    /* Make sure local_size is a power of 2 */
+    local_size = ufo_math_compute_closest_smaller_power_of_2 (max_work_group_size);
+    /* Number of iterations of every group is given by the number of pixels
+     * divided by the number of pixels *num_groups* can process. */
+    num_groups = MIN (local_size, UFO_MATH_NUM_CHUNKS (n, local_size));
+    num_group_iterations = UFO_MATH_NUM_CHUNKS (n, local_size * num_groups);
+    /* The real number of groups is given by the number of pixels
+     * divided by the group size and the number of group iterations. */
+    num_groups = UFO_MATH_NUM_CHUNKS (n, num_group_iterations * local_size);
+    global_size_1D = num_groups * local_size;
+
+    g_debug ("                 n: %lu", n);
+    g_debug ("        num groups: %lu", num_groups);
+    g_debug ("  group iterations: %d", num_group_iterations);
+    g_debug ("kernel global size: %lu", global_size_1D);
+    g_debug (" kernel local size: %lu", local_size);
+
+    result = g_malloc0 (sizeof (cl_float) * num_groups);
+    group_sums = clCreateBuffer (context,
+                                 CL_MEM_READ_WRITE,
+                                 sizeof (cl_float) * num_groups,
+                                 NULL,
+                                 &err);
+    UFO_RESOURCES_CHECK_CLERR (err);
+
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 0, sizeof (cl_mem), &out_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 1, sizeof (cl_mem), &group_sums));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 2, sizeof (cl_mem), &out_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 3, sizeof (cl_float) * local_size, NULL));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 4, sizeof (gsize), &n));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 5, sizeof (gint), &num_group_iterations));
+    ufo_profiler_call (profiler, cmd_queue, sum_kernel, 1, &global_size_1D, &local_size);
+
+    clEnqueueReadBuffer (cmd_queue,
+                         group_sums,
+                         CL_TRUE,
+                         0, sizeof (cl_float) * num_groups,
+                         result,
+                         0, NULL, NULL);
+    UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (group_sums));
+
+    /* Sum partial sums computed by the groups. */
+    for (gsize i = 0; i < num_groups; i++) {
+        sum += result[i];
+    }
+    g_free (result);
+
+    return sqrt (G_PI_2) / (6 * (global_size[0] - 2.0f) * (global_size[1] - 2.0f)) * sum;
+}
diff --git a/src/common/ufo-common.h b/src/common/ufo-common.h
new file mode 100644
index 0000000..f6e2349
--- /dev/null
+++ b/src/common/ufo-common.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2015-2019 Karlsruhe Institute of Technology
+ *
+ * This file is part of Ufo.
+ *
+ * This library is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef UFO_COMMON_H
+#define UFO_COMMON_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <ufo/ufo.h>
+
+gfloat          ufo_common_estimate_sigma                  (cl_kernel convolution_kernel,
+                                                            cl_kernel sum_kernel,
+                                                            cl_command_queue cmd_queue,
+                                                            cl_sampler sampler,
+                                                            UfoProfiler *profiler,
+                                                            cl_mem input_image,
+                                                            cl_mem out_mem,
+                                                            const gsize max_work_group_size,
+                                                            const gsize *global_size);
+
+#endif
diff --git a/src/common/ufo-math.c b/src/common/ufo-math.c
index 78b7dda..725b7e6 100644
--- a/src/common/ufo-math.c
+++ b/src/common/ufo-math.c
@@ -140,3 +140,12 @@ ufo_array_minimum (gdouble *array, gint num_values)
 {
     return find_extremum (array, num_values, 1);
 }
+
+gsize
+ufo_math_compute_closest_smaller_power_of_2 (gsize value)
+{
+    gdouble integer;
+    modf (log2 (value), &integer);
+
+    return (gsize) pow (2, integer);
+}
diff --git a/src/common/ufo-math.h b/src/common/ufo-math.h
index 70084ce..8b283e1 100644
--- a/src/common/ufo-math.h
+++ b/src/common/ufo-math.h
@@ -24,6 +24,8 @@
 
 #define UFO_MATH_EPSILON 1e-7
 #define UFO_MATH_ARE_ALMOST_EQUAL(a, b) (ABS ((a) - (b)) < UFO_MATH_EPSILON)
+#define UFO_MATH_NUM_CHUNKS(n, k) (((n) - 1) / (k) + 1)
+
 
 typedef struct {
     gdouble x, y, z;
@@ -54,5 +56,6 @@ gdouble          ufo_array_minimum              (gdouble     *array,
 gdouble          ufo_clip_value                 (gdouble value,
                                                  gdouble minimum,
                                                  gdouble maximum);
+gsize            ufo_math_compute_closest_smaller_power_of_2 (gsize value);
 
 #endif
diff --git a/src/kernels/cumsum.cl b/src/kernels/cumsum.cl
new file mode 100644
index 0000000..4d88b93
--- /dev/null
+++ b/src/kernels/cumsum.cl
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2011-2019 Karlsruhe Institute of Technology
+ *
+ * This file is part of Ufo.
+ *
+ * This library is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
+ */
+/* There are 16 memory banks, so divide n by their number, which offsets every
+ * thread by n / 16 */
+#define COMPUTE_SHIFTED_INDEX(n) ((n) + ((n) >> 4))
+
+void sum_chunk_no_conflicts (global float *input,
+                             global float *output,
+                             local float *cache,
+                             const int height_offset,
+                             const int group_offset,
+                             const int width,
+                             const int local_width,
+                             const int group_iterations,
+                             const int gx,
+                             const int lx)
+{
+    float tmp;
+    int d, ai, bi, offset = 1;
+    int tx_0 = height_offset + 2 * local_width * gx * group_iterations + 2 * lx + group_offset;
+
+    if (tx_0 - height_offset < width) {
+        cache[COMPUTE_SHIFTED_INDEX (2 * lx)] = input[tx_0];
+    }
+    if (tx_0 + 1 - height_offset < width) {
+        cache[COMPUTE_SHIFTED_INDEX (2 * lx + 1)] = input[tx_0 + 1];
+    }
+    barrier (CLK_LOCAL_MEM_FENCE);
+
+    // build sum in place up the tree
+    for (d = local_width; d > 0; d >>= 1) {
+        if (lx < d) {
+            ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1);
+            bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1);
+            cache[bi] += cache[ai];
+        }
+        offset <<= 1;
+        barrier (CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (lx == local_width - 1) {
+        tmp = group_offset == 0 ? 0 : output[height_offset + 2 * local_width * gx * group_iterations + group_offset - 1];
+        cache[COMPUTE_SHIFTED_INDEX (2 * local_width)] = cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] + tmp;
+        cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] = tmp;
+    }
+
+    // traverse down tree & build scan
+    for (d = 1; d <= local_width; d <<= 1) {
+        offset >>= 1;
+        barrier (CLK_LOCAL_MEM_FENCE);
+        if (lx < d) {
+            ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1);
+            bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1);
+            tmp = cache[ai];
+            cache[ai] = cache[bi];
+            cache[bi] += tmp;
+        }
+    }
+    barrier (CLK_LOCAL_MEM_FENCE);
+
+    if (tx_0 - height_offset < width) {
+        output[tx_0] = cache[bi];
+    }
+    if (tx_0 + 1 - height_offset < width) {
+        output[tx_0 + 1] = cache[COMPUTE_SHIFTED_INDEX (2 * lx + 2)];
+    }
+}
+
+kernel void cumsum (global float *input,
+                    global float *output,
+                    global float *block_sums,
+                    local float *cache,
+                    const int group_iterations,
+                    const int width)
+{
+    int local_width = get_local_size (0);
+    int gx = get_group_id (0);
+    int lx = get_local_id (0);
+    int idy = get_global_id (1);
+    int height_offset = idy * width;
+
+    for (int group_offset = 0; group_offset < 2 * local_width * group_iterations; group_offset += 2 * local_width) {
+        sum_chunk_no_conflicts (input, output, cache, height_offset, group_offset, width, local_width, group_iterations, gx, lx);
+        barrier (CLK_GLOBAL_MEM_FENCE);
+    }
+    if (block_sums && lx == 0) {
+        block_sums[gx + 2 * local_width * idy] = output[height_offset +
+                                                        min(width - 1, 2 * local_width * (gx + 1) * group_iterations - 1)];
+    }
+}
+
+kernel void spread_block_sums (global float *output,
+                               global float *block_sums,
+                               const int group_iterations,
+                               const int width)
+{
+    int local_width = get_local_size (0);
+    int gx = get_group_id (0);
+    int lx = get_local_id (0);
+    int tx = local_width * ((gx + 1) * group_iterations) + lx;
+    int idy = get_global_id (1);
+
+    for (int group_offset = 0; group_offset < local_width * group_iterations; group_offset += local_width) {
+        if (tx + group_offset >= width) {
+            break;
+        }
+        output[idy * width + tx + group_offset] += block_sums[gx + local_width * idy];
+    }
+}
diff --git a/src/kernels/estimate-noise.cl b/src/kernels/estimate-noise.cl
new file mode 100644
index 0000000..31815d3
--- /dev/null
+++ b/src/kernels/estimate-noise.cl
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2011-2019 Karlsruhe Institute of Technology
+ *
+ * This file is part of Ufo.
+ *
+ * This library is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+kernel void
+convolve_abs_laplacian_diff (read_only image2d_t input,
+                             const sampler_t sampler,
+                             global float *output)
+{
+    int idx = get_global_id (0);
+    int idy = get_global_id (1);
+    int width = get_global_size (0);
+    int height = get_global_size (1);
+    float sum = 0.0f;
+    float coeffs[3][3] = {{ 1.0f, -2.0f,  1.0f},
+                        {-2.0f,  4.0f, -2.0f},
+                        { 1.0f, -2.0f,  1.0f}};
+
+    for (int dy = -1; dy < 2; dy++) {
+        for (int dx = -1; dx < 2; dx++) {
+            sum += coeffs[dy + 1][dx + 1] * read_imagef (input, sampler, (float2) ((idx + dx + 0.5f) / width,
+                                                                                   (idy + dy + 0.5f) / height)).x;
+        }
+    }
+
+    output[idy * width + idx] = fabs (sum);
+}
diff --git a/src/kernels/meson.build b/src/kernels/meson.build
index 7d33d77..4126c27 100644
--- a/src/kernels/meson.build
+++ b/src/kernels/meson.build
@@ -9,9 +9,11 @@ kernel_files = [
     'correlate.cl',
     'cut.cl',
     'cut-sinogram.cl',
+    'cumsum.cl',
     'denoise.cl',
     'dfi.cl',
     'edge.cl',
+    'estimate-noise.cl',
     'ffc.cl',
     'fft.cl',
     'fftmult.cl',
diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl
index 8b1bcb8..a094f34 100644
--- a/src/kernels/nlm.cl
+++ b/src/kernels/nlm.cl
@@ -17,63 +17,150 @@
  * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#define flatten(x,y,r,w) ((y-r)*w + (x-r))
-
-/* Compute the distance of two neighbourhood vectors _starting_ from index i
-   and j and edge length radius */
 float
-dist (global float *input,
-      int i,
-      int j,
-      int radius,
-      int image_width)
+compute_dist (read_only image2d_t input,
+              sampler_t sampler,
+              float2 p,
+              float2 q,
+              int radius,
+              int width,
+              int height,
+              float h_2,
+              float variance,
+              constant float *window_coeffs)
 {
     float dist = 0.0f, tmp;
-    float wsize = (2.0f * radius + 1.0f);
-    wsize *= wsize;
-
-    const int nb_width = 2 * radius + 1;
-    const int stride = image_width - nb_width;
-    for (int k = 0; k < nb_width; k++, i += stride, j += stride) {
-        for (int l = 0; l < nb_width; l++, i++, j++) {
-            tmp = input[i] - input[j];
-            dist += tmp * tmp;
+    int wsize = (2 * radius + 1);
+    float coeff = h_2;
+
+    if (!window_coeffs) {
+        /* Gaussian window is normalized to sum=1, so if it is used, we are done
+         * with just summation. If it is not, we need to compute the mean. */
+        coeff /= wsize * wsize;
+    }
+
+    for (int j = -radius; j < radius + 1; j++) {
+        for (int i = -radius; i < radius + 1; i++) {
+            tmp = read_imagef (input, sampler, (float2) ((p.x + i) / width, (p.y + j) / height)).x -
+                    read_imagef (input, sampler, (float2) ((q.x + i) / width, (q.y + j) / height)).x;
+            if (window_coeffs) {
+                dist += window_coeffs[(j + radius) * wsize + (i + radius)] * (tmp * tmp - 2 * variance);
+            } else {
+                dist += tmp * tmp - 2 * variance;
+            }
         }
     }
-    return dist / wsize;
+
+    return fmax (0.0f, dist) * coeff;
+}
+
+kernel void
+compute_shifted_mse (read_only image2d_t input,
+                     global float *output,
+                     sampler_t sampler,
+                     const int dx,
+                     const int dy,
+                     const int padding,
+                     const float variance)
+{
+    /* Global dimensions are for the padded output image */
+    int idx = get_global_id (0);
+    int idy = get_global_id (1);
+    int padded_width = get_global_size (0);
+    /* Image is padded by *padding* on both sides of both dimensions. */
+    float cropped_width = (float) (padded_width - 2 * padding);
+    float cropped_height = (float) (get_global_size (1) - 2 * padding);
+    float x = ((float) (idx - padding)) + 0.5f;
+    float y = ((float) (idy - padding)) + 0.5f;
+    float a, b;
+
+    a = read_imagef (input, sampler, (float2) (x / cropped_width, y / cropped_height)).x;
+    b = read_imagef (input, sampler, (float2) ((x + dx) / cropped_width, (y + dy) / cropped_height)).x;
+
+    output[idy * padded_width + idx] = (a - b) * (a - b) - 2 * variance;
+}
+
+kernel void
+process_shift (read_only image2d_t input,
+               global float *integral_input,
+               global float *weights,
+               global float *output,
+               const sampler_t sampler,
+               const int dx,
+               const int dy,
+               const int patch_radius,
+               const int padding,
+               const float coeff,
+               const int is_conjugate)
+{
+    /* Global dimensions are for the cropped output image */
+    int idx = get_global_id (0);
+    int idy = get_global_id (1);
+    /* Image is padded by *padding* on both sides of both dimensions. */
+    int x = idx + padding;
+    int y = idy + padding;
+    float x_im = idx + dx + 0.5f;
+    float y_im = idy + dy + 0.5f;
+    int cropped_width = get_global_size (0);
+    int cropped_height = get_global_size (1);
+    int padded_width = cropped_width + 2 * padding;
+    float dist, weight;
+    if (is_conjugate) {
+        /* direct computation: g(x) = w(x) + f(x + dx)
+         * conjugate: g(x + dx) = w(x) + f(x), where idx = x + dx, so
+         * g(idx) = w(idx - dx) * f(idx - x) = w(idx - dx) * f(x_im - 2dx)
+         */
+        x -= dx;
+        y -= dy;
+        x_im -= 2 * dx;
+        y_im -= 2 * dy;
+    }
+
+    dist = integral_input[(y + patch_radius) * padded_width + x + patch_radius] -
+           integral_input[(y - patch_radius - 1) * padded_width + x + patch_radius] -
+           integral_input[(y + patch_radius) * padded_width + x - patch_radius - 1] +
+           integral_input[(y - patch_radius - 1) * padded_width + x - patch_radius - 1];
+    dist = fmax (0.0f, dist) * coeff;
+    weight = exp (-dist);
+
+    weights[idy * cropped_width + idx] += weight;
+    output[idy * cropped_width + idx] += weight * read_imagef (input, sampler, (float2) (x_im / cropped_width,
+                                                                                         y_im / cropped_height)).x;
+}
+
+kernel void divide_inplace (global float *coefficients,
+                            global float *output)
+{
+    int index = get_global_size (0) * get_global_id (1) + get_global_id (0);
+
+    output[index] = native_divide (output[index], coefficients[index]);
 }
 
 kernel void
-nlm_noise_reduction (global float *input,
+nlm_noise_reduction (read_only image2d_t input,
                      global float *output,
+                     sampler_t sampler,
                      const int search_radius,
                      const int patch_radius,
-                     const float sigma)
+                     const float h_2,
+                     const float variance,
+                     constant float *window_coeffs)
 {
     const int x = get_global_id (0);
     const int y = get_global_id (1);
     const int width = get_global_size (0);
     const int height = get_global_size (1);
-    const float sigma_2 = sigma * sigma;
+    float d, weight;
 
     float total_weight = 0.0f;
     float pixel_value = 0.0f;
 
-    /* 
-     * Compute the upper left (sx,sy) and lower right (tx, ty) corner points of
-     * our search window.
-     */
-    int r = min (patch_radius, min(width - 1 - x, min (height - 1 - y, min (x, y))));
-    int sx = max (x - search_radius, r);
-    int sy = max (y - search_radius, r);
-    int tx = min (x + search_radius, width - 1 - r);
-    int ty = min (y + search_radius, height - 1 - r);
-
-    for (int i = sx; i < tx; i++) {
-        for (int j = sy; j < ty; j++) {
-            float d = dist (input, flatten(x, y, r, width), flatten (i,j,r,width), r, width);
-            float weight = exp (- sigma_2 * d);
-            pixel_value += weight * input[j * width + i];
+    for (int j = y - search_radius; j < y + search_radius + 1; j++) {
+        for (int i = x - search_radius; i < x + search_radius + 1; i++) {
+            d = compute_dist (input, sampler, (float2) (x + 0.5f, y + 0.5f), (float2) (i + 0.5f, j + 0.5f),
+                      patch_radius, width, height, h_2, variance, window_coeffs);
+            weight = exp (-d);
+            pixel_value += weight * read_imagef (input, sampler, (float2) ((i + 0.5f) / width, (j + 0.5f) / height)).x;
             total_weight += weight;
         }
     }
diff --git a/src/meson.build b/src/meson.build
index eb981b2..4cd4bdc 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -48,7 +48,6 @@ plugins = [
     'merge',
     'metaballs',
     'monitor',
-    'non-local-means',
     'null',
     'opencl',
     'opencl-reduce',
@@ -171,6 +170,20 @@ shared_module('conebeamprojectionweight',
     install_dir: plugin_install_dir,
 )
 
+# non local means
+
+shared_module('nonlocalmeans',
+    sources: [
+        'ufo-non-local-means-task.c',
+        'common/ufo-math.c',
+        'common/ufo-common.c',
+    ],
+    dependencies: deps,
+    name_prefix: 'libufofilter',
+    install: true,
+    install_dir: plugin_install_dir,
+)
+
 # fft plugins
 
 have_clfft = clfft_dep.found()
diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c
index 650cecc..4848d42 100644
--- a/src/ufo-non-local-means-task.c
+++ b/src/ufo-non-local-means-task.c
@@ -22,15 +22,31 @@
 #else
 #include <CL/cl.h>
 #endif
+#include <math.h>
 
 #include "ufo-non-local-means-task.h"
+#include "common/ufo-math.h"
+#include "common/ufo-addressing.h"
+#include "common/ufo-common.h"
 
+#define PIXELS_PER_THREAD 4
 
 struct _UfoNonLocalMeansTaskPrivate {
     guint search_radius;
     guint patch_radius;
+    gsize max_work_group_size, cropped_size[2], padded_size[2];
+    gint padding;
+    gfloat h;
     gfloat sigma;
-    cl_kernel kernel;
+    gboolean use_window;
+    gboolean fast;
+    gboolean estimate_sigma;
+    cl_kernel kernel, mse_kernel, cumsum_kernel, spread_kernel, transpose_kernel, weight_kernel, divide_kernel,
+              convolution_kernel, sum_kernel;
+    cl_sampler sampler;
+    cl_context context;
+    cl_mem window_mem, group_sums, aux_mem[2], weights_mem;
+    AddressingMode addressing_mode;
 };
 
 static void ufo_task_interface_init (UfoTaskIface *iface);
@@ -45,12 +61,366 @@ enum {
     PROP_0,
     PROP_SEARCH_RADIUS,
     PROP_PATCH_RADIUS,
+    PROP_H,
     PROP_SIGMA,
+    PROP_WINDOW,
+    PROP_FAST,
+    PROP_ESTIMATE_SIGMA,
+    PROP_ADDRESSING_MODE,
     N_PROPERTIES
 };
 
 static GParamSpec *properties[N_PROPERTIES] = { NULL, };
 
+static gint
+compute_cumsum_local_width (UfoNonLocalMeansTaskPrivate *priv)
+{
+    gdouble integer;
+    gint local_width;
+
+    /* Compute global and local dimensions for the cumsum kernel */
+    /* First make sure local_width is a power of 2 */
+    local_width = (gint) ufo_math_compute_closest_smaller_power_of_2 (priv->max_work_group_size);
+    if (local_width > 4) {
+        /* Empirically determined value on NVIDIA cards */
+        local_width /= 4;
+    }
+    /* *local_width* is the minimum of the power-of-two-padded minimum of
+     * (width, height) and the desired local width.
+     */
+    modf (log2 (MIN (priv->padded_size[0], priv->padded_size[1]) - 1.0f), &integer);
+    local_width = (gsize) MIN (local_width, pow (2, 1 + ((gint) integer)));
+
+    return local_width;
+}
+
+static void
+release_gaussian_window (UfoNonLocalMeansTaskPrivate *priv)
+{
+    if (priv->window_mem) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->window_mem));
+        priv->window_mem = NULL;
+    }
+}
+
+static void
+create_gaussian_window (UfoNonLocalMeansTaskPrivate *priv)
+{
+    cl_int err;
+    gfloat *coefficients, coefficients_sum = 0.0f, sigma;
+    gsize wsize;
+    gint r;
+
+    wsize = 2 * priv->patch_radius + 1;
+    r = (gint) priv->patch_radius;
+    coefficients = g_malloc0 (sizeof (gfloat) * wsize * wsize);
+    sigma = priv->patch_radius / 2.0f;
+
+    for (gint y = 0; y < (gint) wsize; y++) {
+        for (gint x = 0; x < (gint) wsize; x++) {
+            coefficients[y * wsize + x] = exp (- ((x - r) * (x - r) + (y - r) * (y - r)) / (sigma * sigma * 2.0f));
+            coefficients_sum += coefficients[y * wsize + x];
+        }
+    }
+    for (guint i = 0; i < wsize * wsize; i++) {
+        coefficients[i] /= coefficients_sum;
+    }
+
+    priv->window_mem = clCreateBuffer (priv->context,
+                                       CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                       sizeof (cl_float) * wsize * wsize,
+                                       coefficients,
+                                       &err);
+    UFO_RESOURCES_CHECK_CLERR (err);
+    g_free (coefficients);
+}
+
+static void
+release_fast_buffers (UfoNonLocalMeansTaskPrivate *priv)
+{
+    if (priv->group_sums) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->group_sums));
+        priv->group_sums = NULL;
+    }
+    if (priv->weights_mem) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->weights_mem));
+        priv->weights_mem = NULL;
+    }
+    for (gint i = 0; i < 2; i++) {
+        if (priv->aux_mem[i]) {
+            UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->aux_mem[i]));
+            priv->aux_mem[i] = NULL;
+        }
+    }
+}
+
+static void
+create_fast_buffers (UfoNonLocalMeansTaskPrivate *priv)
+{
+    cl_int err;
+    gint local_width = compute_cumsum_local_width (priv);
+
+    priv->group_sums = clCreateBuffer (priv->context,
+                                     CL_MEM_READ_WRITE,
+                                     sizeof (cl_float) * local_width * MAX (priv->padded_size[0], priv->padded_size[1]),
+                                     NULL,
+                                     &err);
+    UFO_RESOURCES_CHECK_CLERR (err);
+
+    priv->weights_mem = clCreateBuffer (priv->context,
+                                        CL_MEM_READ_WRITE,
+                                        sizeof (cl_float) * priv->cropped_size[0] * priv->cropped_size[1],
+                                        NULL,
+                                        &err);
+    for (gint i = 0; i < 2; i++) {
+        priv->aux_mem[i] = clCreateBuffer (priv->context,
+                                           CL_MEM_READ_WRITE,
+                                           sizeof (cl_float) * priv->padded_size[0] * priv->padded_size[1],
+                                           NULL,
+                                           &err);
+        UFO_RESOURCES_CHECK_CLERR (err);
+    }
+    UFO_RESOURCES_CHECK_CLERR (err);
+}
+
+static void
+transpose (UfoNonLocalMeansTaskPrivate *priv,
+           cl_command_queue cmd_queue,
+           UfoProfiler *profiler,
+           cl_mem in_mem,
+           cl_mem out_mem,
+           const gint width,
+           const gint height)
+{
+    gsize global_size[2];
+    gsize local_size[2] = {32, 32 / PIXELS_PER_THREAD};
+    static gint iteration_number = 0;
+
+    while (local_size[0] * local_size[1] > priv->max_work_group_size) {
+        local_size[0] /= 2;
+        local_size[1] /= 2;
+    }
+    global_size[0] = ((width - 1) / local_size[0] + 1) * local_size[0];
+    global_size[1] = ((height - 1) / local_size[1] / PIXELS_PER_THREAD + 1) * local_size[1];
+    if (!iteration_number) {
+        g_debug ("Image size: %d x %d", width, height);
+        g_debug ("Transpose global work group size: %lu x %lu", global_size[0], global_size[1]);
+        g_debug ("Transpose local work group size: %lu x %lu", local_size[0], local_size[1]);
+        iteration_number++;
+    }
+
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 0, sizeof (cl_mem), &in_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 1, sizeof (cl_mem), &out_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 2,
+                               (local_size[0] + 1) * local_size[1] * PIXELS_PER_THREAD * sizeof (cl_float), NULL));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 3, sizeof (cl_int), &width));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 4, sizeof (cl_int), &height));
+    ufo_profiler_call (profiler, cmd_queue, priv->transpose_kernel, 2, global_size, local_size);
+}
+
+static void
+compute_cumsum (UfoNonLocalMeansTaskPrivate *priv,
+                cl_command_queue cmd_queue,
+                UfoProfiler *profiler,
+                cl_mem in_mem,
+                cl_mem out_mem,
+                const gint width,
+                const gint height)
+{
+    gint local_width, num_groups, num_group_iterations, one = 1;
+    gsize cumsum_global_size[2], block_sums_global_size[2], spread_global_size[2],
+          local_size[2], spread_local_size[2];
+    gsize cache_size;
+    static gint iteration_number = 0;
+
+    local_width = compute_cumsum_local_width (priv);
+    /* Number of groups we need to process *width* pixels. If it is more than
+     * *local_width* then use only that number and every group will process more
+     * successive blocks given by group size. This is necessary in order to
+     * avoid recursion in the spreading phase of the group sums. The local cache
+     * memory is limited to *local_width*, every group stores its sum into an
+     * auxiliary buffer, which is then also summed (only one iteration needed,
+     * because there is only one group in the auxiliary buffer since we limit
+     * the number of groups to *local_width*.
+     * This is not be the final number of groups, it's just used to compute the
+     * number of iterations of every group.
+     */
+    num_groups = MIN (local_width, UFO_MATH_NUM_CHUNKS (width, local_width));
+    /* Number of iterations of every group is given by the number of pixels
+     * divided by the number of pixels *num_groups* can process. */
+    num_group_iterations = UFO_MATH_NUM_CHUNKS (width, local_width * num_groups);
+    /* Finally, the real number of groups is given by the number of pixels
+     * divided by the group size and the number of group iterations. */
+    num_groups = UFO_MATH_NUM_CHUNKS (width, num_group_iterations * local_width);
+
+    /* Cache size must be larger by *local_size* / 16 because of the bank
+     * conflicts avoidance. Additionally, +1 is needed because of the shifted
+     * access to the local memory.
+     */
+    cache_size = sizeof (cl_float) * (local_width + UFO_MATH_NUM_CHUNKS (local_width, 16) + 1);
+    cumsum_global_size[0] = num_groups * local_width / 2;
+    cumsum_global_size[1] = height;
+    block_sums_global_size[0] = local_width / 2;
+    block_sums_global_size[1] = height;
+    spread_global_size[0] = (num_groups - 1) * local_width;
+    spread_global_size[1] = height;
+    local_size[0] = local_width / 2;
+    local_size[1] = 1;
+    spread_local_size[0] = local_width;
+    spread_local_size[1] = 1;
+    if (!iteration_number) {
+        g_debug ("           width: %d", width);
+        g_debug ("     local width: %d", local_width);
+        g_debug ("      num groups: %d", num_groups);
+        g_debug ("group iterations: %d", num_group_iterations);
+        g_debug ("     kernel dims: %lu %lu %lu %lu", cumsum_global_size[0], cumsum_global_size[1], local_size[0], local_size[1]);
+        g_debug ("      cache size: %lu", cache_size);
+        iteration_number++;
+    }
+
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 0, sizeof (cl_mem), &in_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 1, sizeof (cl_mem), &out_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 2, sizeof (cl_mem), &priv->group_sums));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 3, cache_size, NULL));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 4, sizeof (gint), &num_group_iterations));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 5, sizeof (gint), &width));
+    ufo_profiler_call (profiler, cmd_queue, priv->cumsum_kernel, 2, cumsum_global_size, local_size);
+
+    if (num_groups > 1) {
+        /* If there are more than one group, we need to spread the partial sums
+         * of the groups to successive groups. First compute the sums of the
+         * groups and then spread them to respective pixels in them. Because of
+         * our choice of number of iterations per group above, the partial sums
+         * have to be summed only once without recursion.
+         */
+        /* First sum the partial sums. */
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 0, sizeof (cl_mem), &priv->group_sums));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 1, sizeof (cl_mem), &priv->group_sums));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 2, sizeof (cl_mem), NULL));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 3, cache_size, NULL));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 4, sizeof (gint), &one));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 5, sizeof (gint), &local_width));
+        ufo_profiler_call (profiler, cmd_queue, priv->cumsum_kernel, 2, block_sums_global_size, local_size);
+
+        /* Spread them across all pixels. */
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 0, sizeof (cl_mem), &out_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 1, sizeof (cl_mem), &priv->group_sums));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 2, sizeof (gint), &num_group_iterations));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 3, sizeof (gint), &width));
+        ufo_profiler_call (profiler, cmd_queue, priv->spread_kernel, 2, spread_global_size, spread_local_size);
+    }
+}
+
+static void
+compute_sdx (UfoNonLocalMeansTaskPrivate *priv,
+             cl_command_queue cmd_queue,
+             UfoProfiler *profiler,
+             cl_mem in_mem,
+             cl_mem out_mem,
+             const gint dx,
+             const gint dy)
+{
+    gfloat var = priv->sigma * priv->sigma;
+
+    /* First compute the shifted MSE */
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 0, sizeof (cl_mem), &in_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 1, sizeof (cl_mem), &priv->aux_mem[0]));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 2, sizeof (cl_sampler), &priv->sampler));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 3, sizeof (gint), &dx));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 4, sizeof (gint), &dy));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 5, sizeof (gint), &priv->padding));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 6, sizeof (gint), &var));
+    ufo_profiler_call (profiler, cmd_queue, priv->mse_kernel, 2, priv->padded_size, NULL);
+
+    /* Horizontal cumsum and transposition */
+    compute_cumsum (priv, cmd_queue, profiler, priv->aux_mem[0], priv->aux_mem[1], priv->padded_size[0], priv->padded_size[1]);
+    transpose (priv, cmd_queue, profiler, priv->aux_mem[1], priv->aux_mem[0],
+               priv->padded_size[0], priv->padded_size[1]);
+
+    /* 2D cumsum is separable, so compute the horizontal cumsum of the transposed horizontally cumsummed image */
+    compute_cumsum (priv, cmd_queue, profiler, priv->aux_mem[0], priv->aux_mem[1], priv->padded_size[1], priv->padded_size[0]);
+    transpose (priv, cmd_queue, profiler, priv->aux_mem[1], priv->aux_mem[0],
+               priv->padded_size[1], priv->padded_size[0]);
+}
+
+static void
+process_shift (UfoNonLocalMeansTaskPrivate *priv,
+                cl_command_queue cmd_queue,
+                UfoProfiler *profiler,
+                cl_mem input_image,
+                cl_mem integral_mem,
+                cl_mem out_mem,
+                const gint dx,
+                const gint dy)
+{
+    gint patch_size = 2 * priv->patch_radius + 1;
+    gfloat coeff = 1.0f / (priv->h * priv->h * patch_size * patch_size);
+    gint is_conjugate = 0;
+
+    /* Compute r(x) = w(x) * f(x + dx) */
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 0, sizeof (cl_mem), &input_image));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 1, sizeof (cl_mem), &integral_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 2, sizeof (cl_mem), &priv->weights_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 3, sizeof (cl_mem), &out_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 4, sizeof (cl_sampler), &priv->sampler));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 5, sizeof (gint), &dx));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 6, sizeof (gint), &dy));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 7, sizeof (gint), &priv->patch_radius));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 8, sizeof (gint), &priv->padding));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 9, sizeof (gfloat), &coeff));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 10, sizeof (gint), &is_conjugate));
+    ufo_profiler_call (profiler, cmd_queue, priv->weight_kernel, 2, priv->cropped_size, NULL);
+
+    if (dx != 0 || dy != 0) {
+        /* Compute r(x + dx) = w(x + dx) * f(x), which cannot be computed at
+         * once in the above kernel call because one work item would write to
+         * two global memory locations, thus creating a race condition. */
+        is_conjugate = 1;
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 0, sizeof (cl_mem), &input_image));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 1, sizeof (cl_mem), &integral_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 2, sizeof (cl_mem), &priv->weights_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 3, sizeof (cl_mem), &out_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 4, sizeof (cl_sampler), &priv->sampler));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 5, sizeof (gint), &dx));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 6, sizeof (gint), &dy));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 7, sizeof (gint), &priv->patch_radius));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 8, sizeof (gint), &priv->padding));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 9, sizeof (gfloat), &coeff));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 10, sizeof (gint), &is_conjugate));
+        ufo_profiler_call (profiler, cmd_queue, priv->weight_kernel, 2, priv->cropped_size, NULL);
+    }
+}
+
+static void
+compute_nlm_fast (UfoNonLocalMeansTaskPrivate *priv,
+                  cl_command_queue cmd_queue,
+                  UfoProfiler *profiler,
+                  cl_mem in_mem,
+                  cl_mem out_mem)
+{
+    gint dx, dy, sr;
+    sr = (gint) priv->search_radius;
+
+    /* Every pixel computes its own result and spreads it to the pixel y + dy,
+     * thus we start at dy = 0 every pixel gets its dy < 0 value from the pixel
+     * y - dy. For dy = 0, compute only pixels to the right, so the negative
+     * values are obtained from the pixels to the left (analogous to the dy
+     * case).
+     */
+    for (dy = 0; dy < sr + 1; dy++) {
+        for (dx = dy == 0 ? 0 : -sr; dx < sr + 1; dx++) {
+            compute_sdx (priv, cmd_queue, profiler, in_mem, out_mem, dx, dy);
+            process_shift (priv, cmd_queue, profiler, in_mem, priv->aux_mem[0], out_mem, dx, dy);
+        }
+    }
+    /* Now we have the sum of results and weights, divide them to get the
+     * result.
+     */
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->divide_kernel, 0, sizeof (cl_mem), &priv->weights_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->divide_kernel, 1, sizeof (cl_mem), &out_mem));
+    ufo_profiler_call (profiler, cmd_queue, priv->divide_kernel, 2, priv->cropped_size, NULL);
+}
+
 UfoNode *
 ufo_non_local_means_task_new (void)
 {
@@ -63,12 +433,66 @@ ufo_non_local_means_task_setup (UfoTask *task,
                                 GError **error)
 {
     UfoNonLocalMeansTaskPrivate *priv;
+    cl_int err;
 
     priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
-    priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error);
+    if (priv->fast) {
+        priv->mse_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "compute_shifted_mse", NULL, error);
+        priv->cumsum_kernel = ufo_resources_get_kernel (resources, "cumsum.cl", "cumsum", NULL, error);
+        priv->spread_kernel = ufo_resources_get_kernel (resources, "cumsum.cl", "spread_block_sums", NULL, error);
+        priv->transpose_kernel = ufo_resources_get_kernel (resources, "transpose.cl", "transpose_shared", NULL, error);
+        priv->weight_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "process_shift", NULL, error);
+        priv->divide_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "divide_inplace", NULL, error);
+    } else {
+        priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error);
+    }
+    priv->convolution_kernel = ufo_resources_get_kernel (resources, "estimate-noise.cl", "convolve_abs_laplacian_diff", NULL, error);
+    priv->sum_kernel = ufo_resources_get_kernel (resources, "reductor.cl", "reduce_M_SUM", NULL, error);
+
+    priv->window_mem = NULL;
+    priv->group_sums = NULL;
+    priv->weights_mem = NULL;
+    for (gint i = 0; i < 2; i++) {
+        priv->aux_mem[i] = NULL;
+    }
 
-    if (priv->kernel)
+    if (priv->kernel) {
         UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->kernel), error);
+    }
+    if (priv->mse_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->mse_kernel), error);
+    }
+    if (priv->cumsum_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->cumsum_kernel), error);
+    }
+    if (priv->spread_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->spread_kernel), error);
+    }
+    if (priv->transpose_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->transpose_kernel), error);
+    }
+    if (priv->weight_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->weight_kernel), error);
+    }
+    if (priv->divide_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->divide_kernel), error);
+    }
+    if (priv->convolution_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->convolution_kernel), error);
+    }
+    if (priv->sum_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->sum_kernel), error);
+    }
+
+    priv->context = ufo_resources_get_context (resources);
+    UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainContext (priv->context), error);
+
+    priv->sampler = clCreateSampler (priv->context,
+                                     (cl_bool) TRUE,
+                                     priv->addressing_mode,
+                                     CL_FILTER_NEAREST,
+                                     &err);
+    UFO_RESOURCES_CHECK_CLERR (err);
 }
 
 static void
@@ -77,7 +501,35 @@ ufo_non_local_means_task_get_requisition (UfoTask *task,
                                           UfoRequisition *requisition,
                                           GError **error)
 {
+    UfoNonLocalMeansTaskPrivate *priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
+    UfoGpuNode *node;
+    GValue *max_work_group_size_gvalue;
+
     ufo_buffer_get_requisition (inputs[0], requisition);
+
+    if (priv->max_work_group_size == 0) {
+        node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task)));
+        max_work_group_size_gvalue = ufo_gpu_node_get_info (node, UFO_GPU_NODE_INFO_MAX_WORK_GROUP_SIZE);
+        priv->max_work_group_size = g_value_get_ulong (max_work_group_size_gvalue);
+        g_value_unset (max_work_group_size_gvalue);
+    }
+    if (!priv->fast && priv->use_window && !priv->window_mem) {
+        create_gaussian_window (priv);
+    }
+    if (priv->cropped_size[0] != requisition->dims[0] || priv->cropped_size[1] != requisition->dims[1]) {
+        priv->cropped_size[0] = requisition->dims[0];
+        priv->cropped_size[1] = requisition->dims[1];
+        if (priv->fast) {
+            priv->padding = priv->search_radius + priv->patch_radius + 1;
+            priv->padded_size[0] = priv->cropped_size[0] + 2 * priv->padding;
+            priv->padded_size[1] = priv->cropped_size[1] + 2 * priv->padding;
+            if (priv->group_sums) {
+                /* Buffers exist, which means size changed, release first. */
+                release_fast_buffers (priv);
+            }
+            create_fast_buffers (priv);
+        }
+    }
 }
 
 static guint
@@ -108,24 +560,69 @@ ufo_non_local_means_task_process (UfoTask *task,
     UfoNonLocalMeansTaskPrivate *priv;
     UfoGpuNode *node;
     UfoProfiler *profiler;
+    UfoRequisition in_req;
     cl_command_queue cmd_queue;
     cl_mem in_mem;
     cl_mem out_mem;
+    gfloat h, var, estimated_sigma;
+    gfloat fill_pattern = 0.0f;
+    guint num_processed;
 
     priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
     node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task)));
     cmd_queue = ufo_gpu_node_get_cmd_queue (node);
-    in_mem = ufo_buffer_get_device_array (inputs[0], cmd_queue);
+    in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue);
     out_mem = ufo_buffer_get_device_array (output, cmd_queue);
-
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (guint), &priv->search_radius));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->patch_radius));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (gfloat), &priv->sigma));
-
     profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task));
-    ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL);
+    ufo_buffer_get_requisition (inputs[0], &in_req);
+    g_object_get (task, "num_processed", &num_processed, NULL);
+
+    /* Estimate sigma only from the first image in the stream in order to
+     * keep the results consistent.
+     */
+    if (num_processed == 0 && (priv->h <= 0.0f || priv->estimate_sigma)) {
+        /* Use out_mem for the convolution, it's not necessary after the
+         * computation and can be re-used by the de-noising itself.
+         */
+        estimated_sigma = ufo_common_estimate_sigma (priv->convolution_kernel,
+                                                     priv->sum_kernel,
+                                                     cmd_queue,
+                                                     priv->sampler,
+                                                     profiler,
+                                                     in_mem,
+                                                     out_mem,
+                                                     priv->max_work_group_size,
+                                                     priv->cropped_size);
+        g_debug ("Estimated sigma: %g", estimated_sigma);
+        if (priv->h <= 0.0f) {
+            priv->h = estimated_sigma;
+        }
+        if (priv->estimate_sigma) {
+            priv->sigma = estimated_sigma;
+        }
+    }
+    h = 1 / priv->h / priv->h;
+    var = priv->sigma * priv->sigma;
+
+    if (priv->fast) {
+        UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, out_mem, &fill_pattern, sizeof (gfloat), 0,
+                                                        sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1],
+                                                        0, NULL, NULL));
+        UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, priv->weights_mem, &fill_pattern, sizeof (gfloat), 0,
+                                                        sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1],
+                                                        0, NULL, NULL));
+        compute_nlm_fast (priv, cmd_queue, profiler, in_mem, out_mem);
+    } else {
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &h));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 6, sizeof (gfloat), &var));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 7, sizeof (cl_mem), &priv->window_mem));
+        ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL);
+    }
 
     return TRUE;
 }
@@ -144,20 +641,26 @@ ufo_non_local_means_task_set_property (GObject *object,
             priv->search_radius = g_value_get_uint (value);
             break;
         case PROP_PATCH_RADIUS:
-            {
-                guint v = g_value_get_uint (value);
-
-                if (!(v % 2)) {
-                    g_printerr ("Patch radius must be odd, increasing by one\n");
-                    v++;
-                }
-
-                priv->patch_radius = v;
-            }
+            priv->patch_radius = g_value_get_uint (value);
+            break;
+        case PROP_H:
+            priv->h = g_value_get_float (value);
             break;
         case PROP_SIGMA:
             priv->sigma = g_value_get_float (value);
             break;
+        case PROP_WINDOW:
+            priv->use_window = g_value_get_boolean (value);
+            break;
+        case PROP_FAST:
+            priv->fast = g_value_get_boolean (value);
+            break;
+        case PROP_ESTIMATE_SIGMA:
+            priv->estimate_sigma = g_value_get_boolean (value);
+            break;
+        case PROP_ADDRESSING_MODE:
+            priv->addressing_mode = g_value_get_enum (value);
+            break;
         default:
             G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec);
             break;
@@ -179,9 +682,24 @@ ufo_non_local_means_task_get_property (GObject *object,
         case PROP_PATCH_RADIUS:
             g_value_set_uint (value, priv->patch_radius);
             break;
+        case PROP_H:
+            g_value_set_float (value, priv->h);
+            break;
         case PROP_SIGMA:
             g_value_set_float (value, priv->sigma);
             break;
+        case PROP_WINDOW:
+            g_value_set_boolean (value, priv->use_window);
+            break;
+        case PROP_FAST:
+            g_value_set_boolean (value, priv->fast);
+            break;
+        case PROP_ESTIMATE_SIGMA:
+            g_value_set_boolean (value, priv->estimate_sigma);
+            break;
+        case PROP_ADDRESSING_MODE:
+            g_value_set_enum (value, priv->addressing_mode);
+            break;
         default:
             G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec);
             break;
@@ -199,6 +717,48 @@ ufo_non_local_means_task_finalize (GObject *object)
         UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->kernel));
         priv->kernel = NULL;
     }
+    if (priv->mse_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->mse_kernel));
+        priv->mse_kernel = NULL;
+    }
+    if (priv->cumsum_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->cumsum_kernel));
+        priv->cumsum_kernel = NULL;
+    }
+    if (priv->spread_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->spread_kernel));
+        priv->spread_kernel = NULL;
+    }
+    if (priv->transpose_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->transpose_kernel));
+        priv->transpose_kernel = NULL;
+    }
+    if (priv->weight_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->weight_kernel));
+        priv->weight_kernel = NULL;
+    }
+    if (priv->divide_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->divide_kernel));
+        priv->divide_kernel = NULL;
+    }
+    if (priv->convolution_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->convolution_kernel));
+        priv->convolution_kernel = NULL;
+    }
+    if (priv->sum_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->sum_kernel));
+        priv->sum_kernel = NULL;
+    }
+    if (priv->sampler) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseSampler (priv->sampler));
+        priv->sampler = NULL;
+    }
+    if (priv->context) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseContext (priv->context));
+        priv->context = NULL;
+    }
+    release_gaussian_window (priv);
+    release_fast_buffers (priv);
 
     G_OBJECT_CLASS (ufo_non_local_means_task_parent_class)->finalize (object);
 }
@@ -227,21 +787,57 @@ ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass)
         g_param_spec_uint ("search-radius",
             "Search radius in pixels",
             "Search radius in pixels",
-            10, 8192, 10,
+            1, 8192, 10,
             G_PARAM_READWRITE);
 
     properties[PROP_PATCH_RADIUS] =
         g_param_spec_uint ("patch-radius",
-            "Search radius in pixels",
-            "Search radius in pixels",
-            3, 27, 3,
+            "Patch radius in pixels",
+            "Patch radius in pixels",
+            1, 100, 3,
+            G_PARAM_READWRITE);
+
+    properties[PROP_H] =
+        g_param_spec_float ("h",
+            "Smoothing control parameter, should be around noise standard deviation or slightly less",
+            "Smoothing control parameter, should be around noise standard deviation or slightly less",
+            0.0f, G_MAXFLOAT, 0.1f,
             G_PARAM_READWRITE);
 
     properties[PROP_SIGMA] =
         g_param_spec_float ("sigma",
-            "Sigma",
-            "Sigma",
-            0.0f, G_MAXFLOAT, 0.1f,
+            "Noise standard deviation",
+            "Noise standard deviation",
+            0.0f, G_MAXFLOAT, 0.0f,
+            G_PARAM_READWRITE);
+
+    properties[PROP_WINDOW] =
+        g_param_spec_boolean ("window",
+            "Use Gaussian window by computing patch weights",
+            "Use Gaussian window by computing patch weights",
+            TRUE,
+            G_PARAM_READWRITE);
+
+    properties[PROP_FAST] =
+        g_param_spec_boolean ("fast",
+            "Use a faster version of the algorithm",
+            "Use a faster version of the algorithm",
+            TRUE,
+            G_PARAM_READWRITE);
+
+    properties[PROP_ESTIMATE_SIGMA] =
+        g_param_spec_boolean ("estimate-sigma",
+            "Estimate noise standard deviation",
+            "Estimate noise standard deviation",
+            FALSE,
+            G_PARAM_READWRITE);
+
+    properties[PROP_ADDRESSING_MODE] =
+        g_param_spec_enum ("addressing-mode",
+            "Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")",
+            "Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")",
+            g_enum_register_static ("nlm_addressing_mode", addressing_values),
+            CL_ADDRESS_MIRRORED_REPEAT,
             G_PARAM_READWRITE);
 
     for (guint i = PROP_0 + 1; i < N_PROPERTIES; i++)
@@ -257,5 +853,16 @@ ufo_non_local_means_task_init(UfoNonLocalMeansTask *self)
 
     self->priv->search_radius = 10;
     self->priv->patch_radius = 3;
-    self->priv->sigma = 0.1f;
+    self->priv->cropped_size[0] = 0;
+    self->priv->cropped_size[1] = 0;
+    self->priv->padded_size[0] = 0;
+    self->priv->padded_size[1] = 0;
+    self->priv->padding = -1;
+    self->priv->h = 0.0f;
+    self->priv->sigma = 0.0f;
+    self->priv->max_work_group_size = 0;
+    self->priv->use_window = TRUE;
+    self->priv->addressing_mode = CL_ADDRESS_MIRRORED_REPEAT;
+    self->priv->fast = TRUE;
+    self->priv->estimate_sigma = FALSE;
 }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ddb567c..8de94f1 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -20,3 +20,6 @@ add_test(test_177
 
 add_test(test_core_149
          ${BASH} "${CMAKE_CURRENT_SOURCE_DIR}/test-core-149.sh")
+
+add_test(test_nlm
+         ${BASH} "${CMAKE_CURRENT_SOURCE_DIR}/test-nlm.sh")
diff --git a/tests/meson.build b/tests/meson.build
index f95722f..6b720d2 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -3,7 +3,8 @@ tests = [
     'test-153',
     'test-161',
     'test-core-149',
-    'test-file-write-regression'
+    'test-file-write-regression',
+    'test-nlm'
 ]
 
 tiffinfo = find_program('tiffinfo', required : false)
diff --git a/tests/test-nlm.sh b/tests/test-nlm.sh
new file mode 100755
index 0000000..09b75dd
--- /dev/null
+++ b/tests/test-nlm.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+python -c "import numpy; import tifffile; tifffile.imsave('nlm-noise-input.tif',
+numpy.random.normal(100., 10., size=(128, 128)).astype(numpy.float32))"
+
+ufo-launch -q read path=nlm-noise-input.tif ! non-local-means patch-radius=3 search-radius=5 fast=False estimate-sigma=True addressing-mode=mirrored_repeat window=False ! write filename=ufo-nlm-slow.tif
+
+ufo-launch -q read path=nlm-noise-input.tif ! non-local-means patch-radius=3 search-radius=5 fast=True estimate-sigma=True addressing-mode=mirrored_repeat !  write filename=ufo-nlm-fast.tif
+
+# Fast and slow difference with respect to the mean must be less than 0.001 %
+python -c "import sys; import tifffile; a = tifffile.imread('ufo-nlm-slow.tif'); b = tifffile.imread('ufo-nlm-fast.tif'); sys.exit(int(abs(a - b).max() > 1e-3))"
author	Tomas Farago <sensej007@email.cz>	2020-02-05 10:27:41 +0100
committer	GitHub <noreply@github.com>	2020-02-05 10:27:41 +0100
commit	076ef3b717c58ae1fca593ab936c649aa14936a9 (patch)
tree	73d463b635762db907c61fd1737eb4040d9661b6
parent	796ba68cfae854736c0adae6cdb56f93ef525e64 (diff)
parent	c94277081c5c0d6cb9f803bc96523dc7ff73a53a (diff)
download	ufo-filters-076ef3b717c58ae1fca593ab936c649aa14936a9.tar.gz ufo-filters-076ef3b717c58ae1fca593ab936c649aa14936a9.tar.bz2 ufo-filters-076ef3b717c58ae1fca593ab936c649aa14936a9.tar.xz ufo-filters-076ef3b717c58ae1fca593ab936c649aa14936a9.zip