1 files changed, 124 insertions, 37 deletions
diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl
index 8b1bcb8..a094f34 100644
--- a/src/kernels/nlm.cl
+++ b/src/kernels/nlm.cl
@@ -17,63 +17,150 @@
  * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#define flatten(x,y,r,w) ((y-r)*w + (x-r))
-
-/* Compute the distance of two neighbourhood vectors _starting_ from index i
-   and j and edge length radius */
 float
-dist (global float *input,
-      int i,
-      int j,
-      int radius,
-      int image_width)
+compute_dist (read_only image2d_t input,
+              sampler_t sampler,
+              float2 p,
+              float2 q,
+              int radius,
+              int width,
+              int height,
+              float h_2,
+              float variance,
+              constant float *window_coeffs)
 {
     float dist = 0.0f, tmp;
-    float wsize = (2.0f * radius + 1.0f);
-    wsize *= wsize;
-
-    const int nb_width = 2 * radius + 1;
-    const int stride = image_width - nb_width;
-    for (int k = 0; k < nb_width; k++, i += stride, j += stride) {
-        for (int l = 0; l < nb_width; l++, i++, j++) {
-            tmp = input[i] - input[j];
-            dist += tmp * tmp;
+    int wsize = (2 * radius + 1);
+    float coeff = h_2;
+
+    if (!window_coeffs) {
+        /* Gaussian window is normalized to sum=1, so if it is used, we are done
+         * with just summation. If it is not, we need to compute the mean. */
+        coeff /= wsize * wsize;
+    }
+
+    for (int j = -radius; j < radius + 1; j++) {
+        for (int i = -radius; i < radius + 1; i++) {
+            tmp = read_imagef (input, sampler, (float2) ((p.x + i) / width, (p.y + j) / height)).x -
+                    read_imagef (input, sampler, (float2) ((q.x + i) / width, (q.y + j) / height)).x;
+            if (window_coeffs) {
+                dist += window_coeffs[(j + radius) * wsize + (i + radius)] * (tmp * tmp - 2 * variance);
+            } else {
+                dist += tmp * tmp - 2 * variance;
+            }
         }
     }
-    return dist / wsize;
+
+    return fmax (0.0f, dist) * coeff;
+}
+
+kernel void
+compute_shifted_mse (read_only image2d_t input,
+                     global float *output,
+                     sampler_t sampler,
+                     const int dx,
+                     const int dy,
+                     const int padding,
+                     const float variance)
+{
+    /* Global dimensions are for the padded output image */
+    int idx = get_global_id (0);
+    int idy = get_global_id (1);
+    int padded_width = get_global_size (0);
+    /* Image is padded by *padding* on both sides of both dimensions. */
+    float cropped_width = (float) (padded_width - 2 * padding);
+    float cropped_height = (float) (get_global_size (1) - 2 * padding);
+    float x = ((float) (idx - padding)) + 0.5f;
+    float y = ((float) (idy - padding)) + 0.5f;
+    float a, b;
+
+    a = read_imagef (input, sampler, (float2) (x / cropped_width, y / cropped_height)).x;
+    b = read_imagef (input, sampler, (float2) ((x + dx) / cropped_width, (y + dy) / cropped_height)).x;
+
+    output[idy * padded_width + idx] = (a - b) * (a - b) - 2 * variance;
+}
+
+kernel void
+process_shift (read_only image2d_t input,
+               global float *integral_input,
+               global float *weights,
+               global float *output,
+               const sampler_t sampler,
+               const int dx,
+               const int dy,
+               const int patch_radius,
+               const int padding,
+               const float coeff,
+               const int is_conjugate)
+{
+    /* Global dimensions are for the cropped output image */
+    int idx = get_global_id (0);
+    int idy = get_global_id (1);
+    /* Image is padded by *padding* on both sides of both dimensions. */
+    int x = idx + padding;
+    int y = idy + padding;
+    float x_im = idx + dx + 0.5f;
+    float y_im = idy + dy + 0.5f;
+    int cropped_width = get_global_size (0);
+    int cropped_height = get_global_size (1);
+    int padded_width = cropped_width + 2 * padding;
+    float dist, weight;
+    if (is_conjugate) {
+        /* direct computation: g(x) = w(x) + f(x + dx)
+         * conjugate: g(x + dx) = w(x) + f(x), where idx = x + dx, so
+         * g(idx) = w(idx - dx) * f(idx - x) = w(idx - dx) * f(x_im - 2dx)
+         */
+        x -= dx;
+        y -= dy;
+        x_im -= 2 * dx;
+        y_im -= 2 * dy;
+    }
+
+    dist = integral_input[(y + patch_radius) * padded_width + x + patch_radius] -
+           integral_input[(y - patch_radius - 1) * padded_width + x + patch_radius] -
+           integral_input[(y + patch_radius) * padded_width + x - patch_radius - 1] +
+           integral_input[(y - patch_radius - 1) * padded_width + x - patch_radius - 1];
+    dist = fmax (0.0f, dist) * coeff;
+    weight = exp (-dist);
+
+    weights[idy * cropped_width + idx] += weight;
+    output[idy * cropped_width + idx] += weight * read_imagef (input, sampler, (float2) (x_im / cropped_width,
+                                                                                         y_im / cropped_height)).x;
+}
+
+kernel void divide_inplace (global float *coefficients,
+                            global float *output)
+{
+    int index = get_global_size (0) * get_global_id (1) + get_global_id (0);
+
+    output[index] = native_divide (output[index], coefficients[index]);
 }
 
 kernel void
-nlm_noise_reduction (global float *input,
+nlm_noise_reduction (read_only image2d_t input,
                      global float *output,
+                     sampler_t sampler,
                      const int search_radius,
                      const int patch_radius,
-                     const float sigma)
+                     const float h_2,
+                     const float variance,
+                     constant float *window_coeffs)
 {
     const int x = get_global_id (0);
     const int y = get_global_id (1);
     const int width = get_global_size (0);
     const int height = get_global_size (1);
-    const float sigma_2 = sigma * sigma;
+    float d, weight;
 
     float total_weight = 0.0f;
     float pixel_value = 0.0f;
 
-    /* 
-     * Compute the upper left (sx,sy) and lower right (tx, ty) corner points of
-     * our search window.
-     */
-    int r = min (patch_radius, min(width - 1 - x, min (height - 1 - y, min (x, y))));
-    int sx = max (x - search_radius, r);
-    int sy = max (y - search_radius, r);
-    int tx = min (x + search_radius, width - 1 - r);
-    int ty = min (y + search_radius, height - 1 - r);
-
-    for (int i = sx; i < tx; i++) {
-        for (int j = sy; j < ty; j++) {
-            float d = dist (input, flatten(x, y, r, width), flatten (i,j,r,width), r, width);
-            float weight = exp (- sigma_2 * d);
-            pixel_value += weight * input[j * width + i];
+    for (int j = y - search_radius; j < y + search_radius + 1; j++) {
+        for (int i = x - search_radius; i < x + search_radius + 1; i++) {
+            d = compute_dist (input, sampler, (float2) (x + 0.5f, y + 0.5f), (float2) (i + 0.5f, j + 0.5f),
+                      patch_radius, width, height, h_2, variance, window_coeffs);
+            weight = exp (-d);
+            pixel_value += weight * read_imagef (input, sampler, (float2) ((i + 0.5f) / width, (j + 0.5f) / height)).x;
             total_weight += weight;
         }
     }