From a10789e252b6dd849f7d8fee016d57132562afad Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Fri, 12 Jul 2019 12:19:22 +0200 Subject: NLM: use texture memory --- docs/filters.rst | 5 +++++ src/kernels/nlm.cl | 49 +++++++++++++++++------------------------- src/ufo-non-local-means-task.c | 48 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 69 insertions(+), 33 deletions(-) diff --git a/docs/filters.rst b/docs/filters.rst index c04b536..530ca67 100644 --- a/docs/filters.rst +++ b/docs/filters.rst @@ -542,6 +542,11 @@ Non-local-means denoising Sigma influencing the Gaussian weighting. + .. gobj:prop:: addressing-mode:enum + + Addressing mode specifies the behavior for pixels falling outside the + original image. See OpenCL ``sampler_t`` documentation for more information. + Horizontal interpolation ------------------------ diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl index 8b1bcb8..f904ac7 100644 --- a/src/kernels/nlm.cl +++ b/src/kernels/nlm.cl @@ -17,35 +17,34 @@ * License along with this library. If not, see . */ -#define flatten(x,y,r,w) ((y-r)*w + (x-r)) - -/* Compute the distance of two neighbourhood vectors _starting_ from index i - and j and edge length radius */ float -dist (global float *input, - int i, - int j, +dist (read_only image2d_t input, + sampler_t sampler, + float2 p, + float2 q, int radius, - int image_width) + int width, + int height) { float dist = 0.0f, tmp; float wsize = (2.0f * radius + 1.0f); wsize *= wsize; - const int nb_width = 2 * radius + 1; - const int stride = image_width - nb_width; - for (int k = 0; k < nb_width; k++, i += stride, j += stride) { - for (int l = 0; l < nb_width; l++, i++, j++) { - tmp = input[i] - input[j]; + for (int i = -radius; i < radius + 1; i++) { + for (int j = -radius; j < radius + 1; j++) { + tmp = read_imagef (input, sampler, (float2) ((p.x + i) / width, (p.y + j) / height)).x - + read_imagef (input, sampler, (float2) ((q.x + i) / width, (q.y + j) / height)).x; dist += tmp * tmp; } } + return dist / wsize; } kernel void -nlm_noise_reduction (global float *input, +nlm_noise_reduction (read_only image2d_t input, global float *output, + sampler_t sampler, const int search_radius, const int patch_radius, const float sigma) @@ -55,25 +54,17 @@ nlm_noise_reduction (global float *input, const int width = get_global_size (0); const int height = get_global_size (1); const float sigma_2 = sigma * sigma; + float d, weight; float total_weight = 0.0f; float pixel_value = 0.0f; - /* - * Compute the upper left (sx,sy) and lower right (tx, ty) corner points of - * our search window. - */ - int r = min (patch_radius, min(width - 1 - x, min (height - 1 - y, min (x, y)))); - int sx = max (x - search_radius, r); - int sy = max (y - search_radius, r); - int tx = min (x + search_radius, width - 1 - r); - int ty = min (y + search_radius, height - 1 - r); - - for (int i = sx; i < tx; i++) { - for (int j = sy; j < ty; j++) { - float d = dist (input, flatten(x, y, r, width), flatten (i,j,r,width), r, width); - float weight = exp (- sigma_2 * d); - pixel_value += weight * input[j * width + i]; + for (int i = x - search_radius; i < x + search_radius; i++) { + for (int j = y - search_radius; j < y + search_radius; j++) { + d = dist (input, sampler, (float2) (x + 0.5f, y + 0.5f), (float2) (i + 0.5f, j + 0.5f), + patch_radius, width, height); + weight = exp (- sigma_2 * d); + pixel_value += weight * read_imagef (input, sampler, (float2) ((i + 0.5f) / width, (j + 0.5f) / height)).x; total_weight += weight; } } diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index 650cecc..76a07ac 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -24,6 +24,7 @@ #endif #include "ufo-non-local-means-task.h" +#include "common/ufo-addressing.h" struct _UfoNonLocalMeansTaskPrivate { @@ -31,6 +32,9 @@ struct _UfoNonLocalMeansTaskPrivate { guint patch_radius; gfloat sigma; cl_kernel kernel; + cl_sampler sampler; + cl_context context; + AddressingMode addressing_mode; }; static void ufo_task_interface_init (UfoTaskIface *iface); @@ -46,6 +50,7 @@ enum { PROP_SEARCH_RADIUS, PROP_PATCH_RADIUS, PROP_SIGMA, + PROP_ADDRESSING_MODE, N_PROPERTIES }; @@ -63,12 +68,23 @@ ufo_non_local_means_task_setup (UfoTask *task, GError **error) { UfoNonLocalMeansTaskPrivate *priv; + cl_int err; priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task); priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error); if (priv->kernel) UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->kernel), error); + + priv->context = ufo_resources_get_context (resources); + UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainContext (priv->context), error); + + priv->sampler = clCreateSampler (priv->context, + (cl_bool) TRUE, + priv->addressing_mode, + CL_FILTER_NEAREST, + &err); + UFO_RESOURCES_CHECK_CLERR (err); } static void @@ -115,14 +131,15 @@ ufo_non_local_means_task_process (UfoTask *task, priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task); node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task))); cmd_queue = ufo_gpu_node_get_cmd_queue (node); - in_mem = ufo_buffer_get_device_array (inputs[0], cmd_queue); + in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue); out_mem = ufo_buffer_get_device_array (output, cmd_queue); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (guint), &priv->search_radius)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->patch_radius)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (gfloat), &priv->sigma)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &priv->sigma)); profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task)); ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL); @@ -158,6 +175,9 @@ ufo_non_local_means_task_set_property (GObject *object, case PROP_SIGMA: priv->sigma = g_value_get_float (value); break; + case PROP_ADDRESSING_MODE: + priv->addressing_mode = g_value_get_enum (value); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec); break; @@ -182,6 +202,9 @@ ufo_non_local_means_task_get_property (GObject *object, case PROP_SIGMA: g_value_set_float (value, priv->sigma); break; + case PROP_ADDRESSING_MODE: + g_value_set_enum (value, priv->addressing_mode); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec); break; @@ -199,6 +222,14 @@ ufo_non_local_means_task_finalize (GObject *object) UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->kernel)); priv->kernel = NULL; } + if (priv->sampler) { + UFO_RESOURCES_CHECK_CLERR (clReleaseSampler (priv->sampler)); + priv->sampler = NULL; + } + if (priv->context) { + UFO_RESOURCES_CHECK_CLERR (clReleaseContext (priv->context)); + priv->context = NULL; + } G_OBJECT_CLASS (ufo_non_local_means_task_parent_class)->finalize (object); } @@ -244,6 +275,14 @@ ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass) 0.0f, G_MAXFLOAT, 0.1f, G_PARAM_READWRITE); + properties[PROP_ADDRESSING_MODE] = + g_param_spec_enum ("addressing-mode", + "Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")", + "Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")", + g_enum_register_static ("nlm_addressing_mode", addressing_values), + CL_ADDRESS_MIRRORED_REPEAT, + G_PARAM_READWRITE); + for (guint i = PROP_0 + 1; i < N_PROPERTIES; i++) g_object_class_install_property (oclass, i, properties[i]); @@ -258,4 +297,5 @@ ufo_non_local_means_task_init(UfoNonLocalMeansTask *self) self->priv->search_radius = 10; self->priv->patch_radius = 3; self->priv->sigma = 0.1f; + self->priv->addressing_mode = CL_ADDRESS_MIRRORED_REPEAT; } -- cgit v1.2.1 From b803bba7305b86b5484d67ed942fd46c5c72f05f Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Fri, 12 Jul 2019 12:20:28 +0200 Subject: NLM: search the same amount in both directions --- src/kernels/nlm.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl index f904ac7..c3ae3f3 100644 --- a/src/kernels/nlm.cl +++ b/src/kernels/nlm.cl @@ -59,8 +59,8 @@ nlm_noise_reduction (read_only image2d_t input, float total_weight = 0.0f; float pixel_value = 0.0f; - for (int i = x - search_radius; i < x + search_radius; i++) { - for (int j = y - search_radius; j < y + search_radius; j++) { + for (int i = x - search_radius; i < x + search_radius + 1; i++) { + for (int j = y - search_radius; j < y + search_radius + 1; j++) { d = dist (input, sampler, (float2) (x + 0.5f, y + 0.5f), (float2) (i + 0.5f, j + 0.5f), patch_radius, width, height); weight = exp (- sigma_2 * d); -- cgit v1.2.1 From b0820597946b6b22463e08d340c04dcaa75448ca Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Fri, 12 Jul 2019 12:21:04 +0200 Subject: NLM: relax parameter limitations --- src/ufo-non-local-means-task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index 76a07ac..ee3b2e9 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -258,14 +258,14 @@ ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass) g_param_spec_uint ("search-radius", "Search radius in pixels", "Search radius in pixels", - 10, 8192, 10, + 1, 8192, 10, G_PARAM_READWRITE); properties[PROP_PATCH_RADIUS] = g_param_spec_uint ("patch-radius", "Search radius in pixels", "Search radius in pixels", - 3, 27, 3, + 1, 100, 3, G_PARAM_READWRITE); properties[PROP_SIGMA] = -- cgit v1.2.1 From 0a427ea1dbba853f7c5e411c628119bb21672aea Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Fri, 12 Jul 2019 12:21:41 +0200 Subject: NLM: Fix copy-paste parameter description error --- src/ufo-non-local-means-task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index ee3b2e9..405b2d1 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -263,8 +263,8 @@ ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass) properties[PROP_PATCH_RADIUS] = g_param_spec_uint ("patch-radius", - "Search radius in pixels", - "Search radius in pixels", + "Patch radius in pixels", + "Patch radius in pixels", 1, 100, 3, G_PARAM_READWRITE); -- cgit v1.2.1 From 456a63362c84b3295ddc0445531d338c62d48fd8 Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Fri, 12 Jul 2019 12:23:24 +0200 Subject: NLM: Use sigma as in the literature i.e. pass 1/sigma to the kernel because there it's used in a multiplicative way. --- src/ufo-non-local-means-task.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index 405b2d1..1192320 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -127,19 +127,21 @@ ufo_non_local_means_task_process (UfoTask *task, cl_command_queue cmd_queue; cl_mem in_mem; cl_mem out_mem; + gfloat sigma; priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task); node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task))); cmd_queue = ufo_gpu_node_get_cmd_queue (node); in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue); out_mem = ufo_buffer_get_device_array (output, cmd_queue); + sigma = 1 / priv->sigma; UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &priv->sigma)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &sigma)); profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task)); ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL); -- cgit v1.2.1 From 7359279146b30c6daeeaf4132fe34452085659df Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Fri, 12 Jul 2019 17:00:05 +0200 Subject: NLM: remove odd patch-radius constraint patch size is always 2 * patch-radius + 1, so no matter what the parameter is, the size is always odd. --- src/ufo-non-local-means-task.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index 1192320..b953b1c 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -163,16 +163,7 @@ ufo_non_local_means_task_set_property (GObject *object, priv->search_radius = g_value_get_uint (value); break; case PROP_PATCH_RADIUS: - { - guint v = g_value_get_uint (value); - - if (!(v % 2)) { - g_printerr ("Patch radius must be odd, increasing by one\n"); - v++; - } - - priv->patch_radius = v; - } + priv->patch_radius = g_value_get_uint (value); break; case PROP_SIGMA: priv->sigma = g_value_get_float (value); -- cgit v1.2.1 From 3e58e1e63d48f6d2809d1f7f2432de87d96c85c3 Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Mon, 15 Jul 2019 08:39:21 +0200 Subject: NLM: add smoothing control parameter *h* Which is what *sigma* used to do, but in fact it should have been used to improve weights computation. --- docs/filters.rst | 8 +++++++- src/kernels/nlm.cl | 13 +++++++------ src/ufo-non-local-means-task.c | 32 +++++++++++++++++++++++++------- 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/docs/filters.rst b/docs/filters.rst index 530ca67..9d6fed0 100644 --- a/docs/filters.rst +++ b/docs/filters.rst @@ -538,9 +538,15 @@ Non-local-means denoising Radius of patches. + .. gobj:prop:: h:float + + Smoothing control parameter, should be around noise standard deviation + or slightly less. Higher h results in a smoother image but with blurred + features. + .. gobj:prop:: sigma:float - Sigma influencing the Gaussian weighting. + Noise standard deviation, improves weights computation. .. gobj:prop:: addressing-mode:enum diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl index c3ae3f3..df96fbd 100644 --- a/src/kernels/nlm.cl +++ b/src/kernels/nlm.cl @@ -24,7 +24,8 @@ dist (read_only image2d_t input, float2 q, int radius, int width, - int height) + int height, + float variance) { float dist = 0.0f, tmp; float wsize = (2.0f * radius + 1.0f); @@ -34,7 +35,7 @@ dist (read_only image2d_t input, for (int j = -radius; j < radius + 1; j++) { tmp = read_imagef (input, sampler, (float2) ((p.x + i) / width, (p.y + j) / height)).x - read_imagef (input, sampler, (float2) ((q.x + i) / width, (q.y + j) / height)).x; - dist += tmp * tmp; + dist += fmax (0.0f, tmp * tmp - 2 * variance); } } @@ -47,13 +48,13 @@ nlm_noise_reduction (read_only image2d_t input, sampler_t sampler, const int search_radius, const int patch_radius, - const float sigma) + const float h_2, + const float variance) { const int x = get_global_id (0); const int y = get_global_id (1); const int width = get_global_size (0); const int height = get_global_size (1); - const float sigma_2 = sigma * sigma; float d, weight; float total_weight = 0.0f; @@ -62,8 +63,8 @@ nlm_noise_reduction (read_only image2d_t input, for (int i = x - search_radius; i < x + search_radius + 1; i++) { for (int j = y - search_radius; j < y + search_radius + 1; j++) { d = dist (input, sampler, (float2) (x + 0.5f, y + 0.5f), (float2) (i + 0.5f, j + 0.5f), - patch_radius, width, height); - weight = exp (- sigma_2 * d); + patch_radius, width, height, variance); + weight = exp (- h_2 * d); pixel_value += weight * read_imagef (input, sampler, (float2) ((i + 0.5f) / width, (j + 0.5f) / height)).x; total_weight += weight; } diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index b953b1c..764778d 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -30,6 +30,7 @@ struct _UfoNonLocalMeansTaskPrivate { guint search_radius; guint patch_radius; + gfloat h; gfloat sigma; cl_kernel kernel; cl_sampler sampler; @@ -49,6 +50,7 @@ enum { PROP_0, PROP_SEARCH_RADIUS, PROP_PATCH_RADIUS, + PROP_H, PROP_SIGMA, PROP_ADDRESSING_MODE, N_PROPERTIES @@ -127,21 +129,23 @@ ufo_non_local_means_task_process (UfoTask *task, cl_command_queue cmd_queue; cl_mem in_mem; cl_mem out_mem; - gfloat sigma; + gfloat h, var; priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task); node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task))); cmd_queue = ufo_gpu_node_get_cmd_queue (node); in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue); out_mem = ufo_buffer_get_device_array (output, cmd_queue); - sigma = 1 / priv->sigma; + h = 1 / priv->h / priv->h; + var = priv->sigma * priv->sigma; UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &sigma)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &h)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 6, sizeof (gfloat), &var)); profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task)); ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL); @@ -165,6 +169,9 @@ ufo_non_local_means_task_set_property (GObject *object, case PROP_PATCH_RADIUS: priv->patch_radius = g_value_get_uint (value); break; + case PROP_H: + priv->h = g_value_get_float (value); + break; case PROP_SIGMA: priv->sigma = g_value_get_float (value); break; @@ -192,6 +199,9 @@ ufo_non_local_means_task_get_property (GObject *object, case PROP_PATCH_RADIUS: g_value_set_uint (value, priv->patch_radius); break; + case PROP_H: + g_value_set_float (value, priv->h); + break; case PROP_SIGMA: g_value_set_float (value, priv->sigma); break; @@ -261,11 +271,18 @@ ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass) 1, 100, 3, G_PARAM_READWRITE); + properties[PROP_H] = + g_param_spec_float ("h", + "Smoothing control parameter, should be around noise standard deviation or slightly less", + "Smoothing control parameter, should be around noise standard deviation or slightly less", + 0.0f, G_MAXFLOAT, 0.1f, + G_PARAM_READWRITE); + properties[PROP_SIGMA] = g_param_spec_float ("sigma", - "Sigma", - "Sigma", - 0.0f, G_MAXFLOAT, 0.1f, + "Noise standard deviation", + "Noise standard deviation", + 0.0f, G_MAXFLOAT, 0.0f, G_PARAM_READWRITE); properties[PROP_ADDRESSING_MODE] = @@ -289,6 +306,7 @@ ufo_non_local_means_task_init(UfoNonLocalMeansTask *self) self->priv->search_radius = 10; self->priv->patch_radius = 3; - self->priv->sigma = 0.1f; + self->priv->h = 0.1f; + self->priv->sigma = 0.0f; self->priv->addressing_mode = CL_ADDRESS_MIRRORED_REPEAT; } -- cgit v1.2.1 From 4539cb2452c5651023286483ce4b6e142cca742e Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Mon, 15 Jul 2019 15:32:26 +0200 Subject: NLM: add gaussian window --- docs/filters.rst | 7 ++++ src/kernels/nlm.cl | 31 ++++++++++++----- src/ufo-non-local-means-task.c | 79 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 8 deletions(-) diff --git a/docs/filters.rst b/docs/filters.rst index 9d6fed0..d0ef056 100644 --- a/docs/filters.rst +++ b/docs/filters.rst @@ -548,6 +548,13 @@ Non-local-means denoising Noise standard deviation, improves weights computation. + .. gobj:prop:: window:boolean + + Apply Gaussian profile with :math:`\sigma = \frac{P}{2}`, where :math:`P` + is the gobj:prop:`patch-radius` parameter to the weight computation + which decreases the influence of pixels towards the corners of the + patches. + .. gobj:prop:: addressing-mode:enum Addressing mode specifies the behavior for pixels falling outside the diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl index df96fbd..852c90c 100644 --- a/src/kernels/nlm.cl +++ b/src/kernels/nlm.cl @@ -25,21 +25,35 @@ dist (read_only image2d_t input, int radius, int width, int height, - float variance) + float h_2, + float variance, + constant float *window_coeffs) { float dist = 0.0f, tmp; - float wsize = (2.0f * radius + 1.0f); - wsize *= wsize; + int wsize = (2 * radius + 1); + float coeff = h_2; + + if (!window_coeffs) { + /* Gaussian window is normalized to sum=1, so if it is used, we are done + * with just summation. If it is not, we need to compute the mean. */ + coeff /= wsize * wsize; + } for (int i = -radius; i < radius + 1; i++) { for (int j = -radius; j < radius + 1; j++) { tmp = read_imagef (input, sampler, (float2) ((p.x + i) / width, (p.y + j) / height)).x - read_imagef (input, sampler, (float2) ((q.x + i) / width, (q.y + j) / height)).x; - dist += fmax (0.0f, tmp * tmp - 2 * variance); + if (window_coeffs) { + /* Use gaussian window. + * Cutoff negative numbers which would cause large weights. */ + dist += fmax (0.0f, window_coeffs[(j + radius) * wsize + (i + radius)] * (tmp * tmp - 2 * variance)); + } else { + dist += fmax (0.0f, tmp * tmp - 2 * variance); + } } } - return dist / wsize; + return dist * coeff; } kernel void @@ -49,7 +63,8 @@ nlm_noise_reduction (read_only image2d_t input, const int search_radius, const int patch_radius, const float h_2, - const float variance) + const float variance, + constant float *window_coeffs) { const int x = get_global_id (0); const int y = get_global_id (1); @@ -63,8 +78,8 @@ nlm_noise_reduction (read_only image2d_t input, for (int i = x - search_radius; i < x + search_radius + 1; i++) { for (int j = y - search_radius; j < y + search_radius + 1; j++) { d = dist (input, sampler, (float2) (x + 0.5f, y + 0.5f), (float2) (i + 0.5f, j + 0.5f), - patch_radius, width, height, variance); - weight = exp (- h_2 * d); + patch_radius, width, height, h_2, variance, window_coeffs); + weight = exp (-d); pixel_value += weight * read_imagef (input, sampler, (float2) ((i + 0.5f) / width, (j + 0.5f) / height)).x; total_weight += weight; } diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index 764778d..ee9f2ad 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -22,6 +22,7 @@ #else #include #endif +#include #include "ufo-non-local-means-task.h" #include "common/ufo-addressing.h" @@ -32,9 +33,11 @@ struct _UfoNonLocalMeansTaskPrivate { guint patch_radius; gfloat h; gfloat sigma; + gboolean use_window; cl_kernel kernel; cl_sampler sampler; cl_context context; + cl_mem window_mem; AddressingMode addressing_mode; }; @@ -52,12 +55,67 @@ enum { PROP_PATCH_RADIUS, PROP_H, PROP_SIGMA, + PROP_WINDOW, PROP_ADDRESSING_MODE, N_PROPERTIES }; static GParamSpec *properties[N_PROPERTIES] = { NULL, }; +/** + * release_coefficients: + * @priv: UfoNonLocalMeansTaskPrivate + * + * Release Gaussian window coefficients memory object. + */ +static void +release_coefficients (UfoNonLocalMeansTaskPrivate *priv) +{ + if (priv->window_mem) { + UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->window_mem)); + priv->window_mem = NULL; + } +} + +/** + * create_coefficients: + * @priv: UfoNonLocalMeansTaskPrivate + * + * Compute Gaussian window coefficients. + */ +static void +create_coefficients (UfoNonLocalMeansTaskPrivate *priv) +{ + cl_int err; + gfloat *coefficients, coefficients_sum = 0.0f, sigma; + gsize wsize; + gint r; + + wsize = 2 * priv->patch_radius + 1; + r = (gint) priv->patch_radius; + coefficients = g_malloc0 (sizeof (gfloat) * wsize * wsize); + sigma = priv->patch_radius / 2.0f; + + for (gint y = 0; y < (gint) wsize; y++) { + for (gint x = 0; x < (gint) wsize; x++) { + coefficients[y * wsize + x] = exp (- ((x - r) * (x - r) + (y - r) * (y - r)) / (sigma * sigma * 2.0f)); + coefficients_sum += coefficients[y * wsize + x]; + } + } + for (guint i = 0; i < wsize * wsize; i++) { + coefficients[i] /= coefficients_sum; + } + + release_coefficients (priv); + priv->window_mem = clCreateBuffer (priv->context, + CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof (cl_float) * wsize * wsize, + coefficients, + &err); + UFO_RESOURCES_CHECK_CLERR (err); + g_free (coefficients); +} + UfoNode * ufo_non_local_means_task_new (void) { @@ -74,6 +132,7 @@ ufo_non_local_means_task_setup (UfoTask *task, priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task); priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error); + priv->window_mem = NULL; if (priv->kernel) UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->kernel), error); @@ -95,7 +154,11 @@ ufo_non_local_means_task_get_requisition (UfoTask *task, UfoRequisition *requisition, GError **error) { + UfoNonLocalMeansTaskPrivate *priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task); ufo_buffer_get_requisition (inputs[0], requisition); + if (priv->use_window && !priv->window_mem) { + create_coefficients (priv); + } } static guint @@ -146,6 +209,7 @@ ufo_non_local_means_task_process (UfoTask *task, UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &h)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 6, sizeof (gfloat), &var)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 7, sizeof (cl_mem), &priv->window_mem)); profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task)); ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL); @@ -175,6 +239,9 @@ ufo_non_local_means_task_set_property (GObject *object, case PROP_SIGMA: priv->sigma = g_value_get_float (value); break; + case PROP_WINDOW: + priv->use_window = g_value_get_boolean (value); + break; case PROP_ADDRESSING_MODE: priv->addressing_mode = g_value_get_enum (value); break; @@ -205,6 +272,9 @@ ufo_non_local_means_task_get_property (GObject *object, case PROP_SIGMA: g_value_set_float (value, priv->sigma); break; + case PROP_WINDOW: + g_value_set_boolean (value, priv->use_window); + break; case PROP_ADDRESSING_MODE: g_value_set_enum (value, priv->addressing_mode); break; @@ -233,6 +303,7 @@ ufo_non_local_means_task_finalize (GObject *object) UFO_RESOURCES_CHECK_CLERR (clReleaseContext (priv->context)); priv->context = NULL; } + release_coefficients (priv); G_OBJECT_CLASS (ufo_non_local_means_task_parent_class)->finalize (object); } @@ -285,6 +356,13 @@ ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass) 0.0f, G_MAXFLOAT, 0.0f, G_PARAM_READWRITE); + properties[PROP_WINDOW] = + g_param_spec_boolean ("window", + "Use Gaussian window by computing patch weights", + "Use Gaussian window by computing patch weights", + TRUE, + G_PARAM_READWRITE); + properties[PROP_ADDRESSING_MODE] = g_param_spec_enum ("addressing-mode", "Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")", @@ -308,5 +386,6 @@ ufo_non_local_means_task_init(UfoNonLocalMeansTask *self) self->priv->patch_radius = 3; self->priv->h = 0.1f; self->priv->sigma = 0.0f; + self->priv->use_window = TRUE; self->priv->addressing_mode = CL_ADDRESS_MIRRORED_REPEAT; } -- cgit v1.2.1 From da6a854a6759bbc2dc1c83bb23f5a2d154a68619 Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Mon, 15 Jul 2019 15:36:34 +0200 Subject: NLM: improve memory access pattern --- src/kernels/nlm.cl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl index 852c90c..0f1cbcd 100644 --- a/src/kernels/nlm.cl +++ b/src/kernels/nlm.cl @@ -39,8 +39,8 @@ dist (read_only image2d_t input, coeff /= wsize * wsize; } - for (int i = -radius; i < radius + 1; i++) { - for (int j = -radius; j < radius + 1; j++) { + for (int j = -radius; j < radius + 1; j++) { + for (int i = -radius; i < radius + 1; i++) { tmp = read_imagef (input, sampler, (float2) ((p.x + i) / width, (p.y + j) / height)).x - read_imagef (input, sampler, (float2) ((q.x + i) / width, (q.y + j) / height)).x; if (window_coeffs) { @@ -75,8 +75,8 @@ nlm_noise_reduction (read_only image2d_t input, float total_weight = 0.0f; float pixel_value = 0.0f; - for (int i = x - search_radius; i < x + search_radius + 1; i++) { - for (int j = y - search_radius; j < y + search_radius + 1; j++) { + for (int j = y - search_radius; j < y + search_radius + 1; j++) { + for (int i = x - search_radius; i < x + search_radius + 1; i++) { d = dist (input, sampler, (float2) (x + 0.5f, y + 0.5f), (float2) (i + 0.5f, j + 0.5f), patch_radius, width, height, h_2, variance, window_coeffs); weight = exp (-d); -- cgit v1.2.1 From 721b8f2db68f6c9cdd4955eb438d0376e41bd479 Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Thu, 25 Jul 2019 12:37:31 +0200 Subject: NLM: rename function dist to compute_dist because of the name clash. --- src/kernels/nlm.cl | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl index 0f1cbcd..603767c 100644 --- a/src/kernels/nlm.cl +++ b/src/kernels/nlm.cl @@ -18,16 +18,16 @@ */ float -dist (read_only image2d_t input, - sampler_t sampler, - float2 p, - float2 q, - int radius, - int width, - int height, - float h_2, - float variance, - constant float *window_coeffs) +compute_dist (read_only image2d_t input, + sampler_t sampler, + float2 p, + float2 q, + int radius, + int width, + int height, + float h_2, + float variance, + constant float *window_coeffs) { float dist = 0.0f, tmp; int wsize = (2 * radius + 1); @@ -77,7 +77,7 @@ nlm_noise_reduction (read_only image2d_t input, for (int j = y - search_radius; j < y + search_radius + 1; j++) { for (int i = x - search_radius; i < x + search_radius + 1; i++) { - d = dist (input, sampler, (float2) (x + 0.5f, y + 0.5f), (float2) (i + 0.5f, j + 0.5f), + d = compute_dist (input, sampler, (float2) (x + 0.5f, y + 0.5f), (float2) (i + 0.5f, j + 0.5f), patch_radius, width, height, h_2, variance, window_coeffs); weight = exp (-d); pixel_value += weight * read_imagef (input, sampler, (float2) ((i + 0.5f) / width, (j + 0.5f) / height)).x; -- cgit v1.2.1 From 692afa64b6e508653d091f641d8a85fc95f7733a Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Wed, 31 Jul 2019 13:28:35 +0200 Subject: NLM: Add fast version of the algorithm --- docs/filters.rst | 13 ++ src/kernels/cumsum.cl | 125 +++++++++++ src/kernels/meson.build | 1 + src/kernels/nlm.cl | 82 ++++++++ src/ufo-non-local-means-task.c | 463 +++++++++++++++++++++++++++++++++++++++-- 5 files changed, 668 insertions(+), 16 deletions(-) create mode 100644 src/kernels/cumsum.cl diff --git a/docs/filters.rst b/docs/filters.rst index d0ef056..2abe2ea 100644 --- a/docs/filters.rst +++ b/docs/filters.rst @@ -555,11 +555,24 @@ Non-local-means denoising which decreases the influence of pixels towards the corners of the patches. + .. gobj:prop:: fast:boolean + + Use a fast version of the algorithm described in [#]_. The only + difference in the result from the classical algorithm is that there is + no Gaussian profile used and from the nature of the fast algorithm, + floating point precision errors might occur for large images. + .. gobj:prop:: addressing-mode:enum Addressing mode specifies the behavior for pixels falling outside the original image. See OpenCL ``sampler_t`` documentation for more information. + .. [#] + J. Darbon, A. Cunha, T.F. Chan, S. Osher, and G.J. Jensen, *Fast + nonlocal filtering applied to electron cryomicroscopy* in 5th IEEE + International Symposium on Biomedical Imaging: From Nano to Macro, + 2008, pp. 1331-1334. DOI:10.1109/ISBI.2008.4541250 + Horizontal interpolation ------------------------ diff --git a/src/kernels/cumsum.cl b/src/kernels/cumsum.cl new file mode 100644 index 0000000..4d88b93 --- /dev/null +++ b/src/kernels/cumsum.cl @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2011-2019 Karlsruhe Institute of Technology + * + * This file is part of Ufo. + * + * This library is free software: you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + */ +/* There are 16 memory banks, so divide n by their number, which offsets every + * thread by n / 16 */ +#define COMPUTE_SHIFTED_INDEX(n) ((n) + ((n) >> 4)) + +void sum_chunk_no_conflicts (global float *input, + global float *output, + local float *cache, + const int height_offset, + const int group_offset, + const int width, + const int local_width, + const int group_iterations, + const int gx, + const int lx) +{ + float tmp; + int d, ai, bi, offset = 1; + int tx_0 = height_offset + 2 * local_width * gx * group_iterations + 2 * lx + group_offset; + + if (tx_0 - height_offset < width) { + cache[COMPUTE_SHIFTED_INDEX (2 * lx)] = input[tx_0]; + } + if (tx_0 + 1 - height_offset < width) { + cache[COMPUTE_SHIFTED_INDEX (2 * lx + 1)] = input[tx_0 + 1]; + } + barrier (CLK_LOCAL_MEM_FENCE); + + // build sum in place up the tree + for (d = local_width; d > 0; d >>= 1) { + if (lx < d) { + ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1); + bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1); + cache[bi] += cache[ai]; + } + offset <<= 1; + barrier (CLK_LOCAL_MEM_FENCE); + } + + if (lx == local_width - 1) { + tmp = group_offset == 0 ? 0 : output[height_offset + 2 * local_width * gx * group_iterations + group_offset - 1]; + cache[COMPUTE_SHIFTED_INDEX (2 * local_width)] = cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] + tmp; + cache[COMPUTE_SHIFTED_INDEX (2 * local_width - 1)] = tmp; + } + + // traverse down tree & build scan + for (d = 1; d <= local_width; d <<= 1) { + offset >>= 1; + barrier (CLK_LOCAL_MEM_FENCE); + if (lx < d) { + ai = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 1) - 1); + bi = COMPUTE_SHIFTED_INDEX (offset * (2 * lx + 2) - 1); + tmp = cache[ai]; + cache[ai] = cache[bi]; + cache[bi] += tmp; + } + } + barrier (CLK_LOCAL_MEM_FENCE); + + if (tx_0 - height_offset < width) { + output[tx_0] = cache[bi]; + } + if (tx_0 + 1 - height_offset < width) { + output[tx_0 + 1] = cache[COMPUTE_SHIFTED_INDEX (2 * lx + 2)]; + } +} + +kernel void cumsum (global float *input, + global float *output, + global float *block_sums, + local float *cache, + const int group_iterations, + const int width) +{ + int local_width = get_local_size (0); + int gx = get_group_id (0); + int lx = get_local_id (0); + int idy = get_global_id (1); + int height_offset = idy * width; + + for (int group_offset = 0; group_offset < 2 * local_width * group_iterations; group_offset += 2 * local_width) { + sum_chunk_no_conflicts (input, output, cache, height_offset, group_offset, width, local_width, group_iterations, gx, lx); + barrier (CLK_GLOBAL_MEM_FENCE); + } + if (block_sums && lx == 0) { + block_sums[gx + 2 * local_width * idy] = output[height_offset + + min(width - 1, 2 * local_width * (gx + 1) * group_iterations - 1)]; + } +} + +kernel void spread_block_sums (global float *output, + global float *block_sums, + const int group_iterations, + const int width) +{ + int local_width = get_local_size (0); + int gx = get_group_id (0); + int lx = get_local_id (0); + int tx = local_width * ((gx + 1) * group_iterations) + lx; + int idy = get_global_id (1); + + for (int group_offset = 0; group_offset < local_width * group_iterations; group_offset += local_width) { + if (tx + group_offset >= width) { + break; + } + output[idy * width + tx + group_offset] += block_sums[gx + local_width * idy]; + } +} diff --git a/src/kernels/meson.build b/src/kernels/meson.build index 7d33d77..03cdac5 100644 --- a/src/kernels/meson.build +++ b/src/kernels/meson.build @@ -9,6 +9,7 @@ kernel_files = [ 'correlate.cl', 'cut.cl', 'cut-sinogram.cl', + 'cumsum.cl', 'denoise.cl', 'dfi.cl', 'edge.cl', diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl index 603767c..d2f6b0f 100644 --- a/src/kernels/nlm.cl +++ b/src/kernels/nlm.cl @@ -56,6 +56,88 @@ compute_dist (read_only image2d_t input, return dist * coeff; } +kernel void +compute_shifted_mse (read_only image2d_t input, + global float *output, + sampler_t sampler, + const int dx, + const int dy, + const int padding, + const float variance) +{ + /* Global dimensions are for the padded output image */ + int idx = get_global_id (0); + int idy = get_global_id (1); + int padded_width = get_global_size (0); + /* Image is padded by *padding* on both sides of both dimensions. */ + float cropped_width = (float) (padded_width - 2 * padding); + float cropped_height = (float) (get_global_size (1) - 2 * padding); + float x = ((float) (idx - padding)) + 0.5f; + float y = ((float) (idy - padding)) + 0.5f; + float a, b; + + a = read_imagef (input, sampler, (float2) (x / cropped_width, y / cropped_height)).x; + b = read_imagef (input, sampler, (float2) ((x + dx) / cropped_width, (y + dy) / cropped_height)).x; + + output[idy * padded_width + idx] = (a - b) * (a - b) - 2 * variance; +} + +kernel void +process_shift (read_only image2d_t input, + global float *integral_input, + global float *weights, + global float *output, + const sampler_t sampler, + const int dx, + const int dy, + const int patch_radius, + const int padding, + const float coeff, + const int is_conjugate) +{ + /* Global dimensions are for the cropped output image */ + int idx = get_global_id (0); + int idy = get_global_id (1); + /* Image is padded by *padding* on both sides of both dimensions. */ + int x = idx + padding; + int y = idy + padding; + float x_im = idx + dx + 0.5f; + float y_im = idy + dy + 0.5f; + int cropped_width = get_global_size (0); + int cropped_height = get_global_size (1); + int padded_width = cropped_width + 2 * padding; + float dist, weight; + if (is_conjugate) { + /* direct computation: g(x) = w(x) + f(x + dx) + * conjugate: g(x + dx) = w(x) + f(x), where idx = x + dx, so + * g(idx) = w(idx - dx) * f(idx - x) = w(idx - dx) * f(x_im - 2dx) + */ + x -= dx; + y -= dy; + x_im -= 2 * dx; + y_im -= 2 * dy; + } + + dist = integral_input[(y + patch_radius) * padded_width + x + patch_radius] - + integral_input[(y - patch_radius - 1) * padded_width + x + patch_radius] - + integral_input[(y + patch_radius) * padded_width + x - patch_radius - 1] + + integral_input[(y - patch_radius - 1) * padded_width + x - patch_radius - 1]; + dist = fmax (0.0f, dist) * coeff; + weight = exp (-dist); + + weights[idy * cropped_width + idx] += weight; + output[idy * cropped_width + idx] += weight * read_imagef (input, sampler, (float2) (x_im / cropped_width, + y_im / cropped_height)).x; +} + +kernel void divide_inplace (global float *coefficients, + global float *output) +{ + int index = get_global_size (0) * get_global_id (1) + get_global_id (0); + + output[index] = native_divide (output[index], coefficients[index]); +} + kernel void nlm_noise_reduction (read_only image2d_t input, global float *output, diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index ee9f2ad..4c14c67 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -27,17 +27,22 @@ #include "ufo-non-local-means-task.h" #include "common/ufo-addressing.h" +#define PIXELS_PER_THREAD 4 +#define NUM_CHUNKS(n, k) (((n) - 1) / (k) + 1) struct _UfoNonLocalMeansTaskPrivate { guint search_radius; guint patch_radius; + gsize max_work_group_size, cropped_size[2], padded_size[2]; + gint padding; gfloat h; gfloat sigma; gboolean use_window; - cl_kernel kernel; + gboolean fast; + cl_kernel kernel, mse_kernel, cumsum_kernel, spread_kernel, transpose_kernel, weight_kernel, divide_kernel; cl_sampler sampler; cl_context context; - cl_mem window_mem; + cl_mem window_mem, group_sums, aux_mem[2], weights_mem; AddressingMode addressing_mode; }; @@ -56,12 +61,36 @@ enum { PROP_H, PROP_SIGMA, PROP_WINDOW, + PROP_FAST, PROP_ADDRESSING_MODE, N_PROPERTIES }; static GParamSpec *properties[N_PROPERTIES] = { NULL, }; +static gint +compute_cumsum_local_width (UfoNonLocalMeansTaskPrivate *priv) +{ + gdouble integer; + gint local_width; + + /* Compute global and local dimensions for the cumsum kernel */ + /* First make sure local_width is a power of 2 */ + modf (log2 (priv->max_work_group_size), &integer); + local_width = (gint) pow (2, integer); + if (local_width > 4) { + /* Empirically determined value on NVIDIA cards */ + local_width /= 4; + } + /* *local_width* is the minimum of the power-of-two-padded minimum of + * (width, height) and the desired local width. + */ + modf (log2 (MIN (priv->padded_size[0], priv->padded_size[1]) - 1.0f), &integer); + local_width = (gsize) MIN (local_width, pow (2, 1 + ((gint) integer))); + + return local_width; +} + /** * release_coefficients: * @priv: UfoNonLocalMeansTaskPrivate @@ -116,6 +145,292 @@ create_coefficients (UfoNonLocalMeansTaskPrivate *priv) g_free (coefficients); } +static void +release_fast_buffers (UfoNonLocalMeansTaskPrivate *priv) +{ + if (priv->group_sums) { + UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->group_sums)); + priv->group_sums = NULL; + } + if (priv->weights_mem) { + UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->weights_mem)); + priv->weights_mem = NULL; + } + for (gint i = 0; i < 2; i++) { + if (priv->aux_mem[i]) { + UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->aux_mem[i])); + priv->aux_mem[i] = NULL; + } + } +} + +static void +create_fast_buffers (UfoNonLocalMeansTaskPrivate *priv) +{ + cl_int err; + gint local_width = compute_cumsum_local_width (priv); + + priv->group_sums = clCreateBuffer (priv->context, + CL_MEM_READ_WRITE, + sizeof (cl_float) * local_width * MAX (priv->padded_size[0], priv->padded_size[1]), + NULL, + &err); + UFO_RESOURCES_CHECK_CLERR (err); + + priv->weights_mem = clCreateBuffer (priv->context, + CL_MEM_READ_WRITE, + sizeof (cl_float) * priv->cropped_size[0] * priv->cropped_size[1], + NULL, + &err); + for (gint i = 0; i < 2; i++) { + priv->aux_mem[i] = clCreateBuffer (priv->context, + CL_MEM_READ_WRITE, + sizeof (cl_float) * priv->padded_size[0] * priv->padded_size[1], + NULL, + &err); + UFO_RESOURCES_CHECK_CLERR (err); + } + UFO_RESOURCES_CHECK_CLERR (err); +} + +static void +transpose (UfoNonLocalMeansTaskPrivate *priv, + cl_command_queue cmd_queue, + UfoProfiler *profiler, + cl_mem in_mem, + cl_mem out_mem, + const gint width, + const gint height) +{ + gsize global_size[2]; + gsize local_size[2] = {32, 32 / PIXELS_PER_THREAD}; + static gint iteration_number = 0; + + while (local_size[0] * local_size[1] > priv->max_work_group_size) { + local_size[0] /= 2; + local_size[1] /= 2; + } + global_size[0] = ((width - 1) / local_size[0] + 1) * local_size[0]; + global_size[1] = ((height - 1) / local_size[1] / PIXELS_PER_THREAD + 1) * local_size[1]; + if (!iteration_number) { + g_debug ("Image size: %d x %d", width, height); + g_debug ("Transpose global work group size: %lu x %lu", global_size[0], global_size[1]); + g_debug ("Transpose local work group size: %lu x %lu", local_size[0], local_size[1]); + iteration_number++; + } + + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 0, sizeof (cl_mem), &in_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 1, sizeof (cl_mem), &out_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 2, + (local_size[0] + 1) * local_size[1] * PIXELS_PER_THREAD * sizeof (cl_float), NULL)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 3, sizeof (cl_int), &width)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 4, sizeof (cl_int), &height)); + ufo_profiler_call (profiler, cmd_queue, priv->transpose_kernel, 2, global_size, local_size); +} + +static void +compute_cumsum (UfoNonLocalMeansTaskPrivate *priv, + cl_command_queue cmd_queue, + UfoProfiler *profiler, + cl_mem in_mem, + cl_mem out_mem, + const gint width, + const gint height) +{ + gint local_width, num_groups, num_group_iterations, one = 1; + gsize cumsum_global_size[2], block_sums_global_size[2], spread_global_size[2], + local_size[2], spread_local_size[2]; + gsize cache_size; + static gint iteration_number = 0; + + local_width = compute_cumsum_local_width (priv); + /* Number of groups we need to process *width* pixels. If it is more than + * *local_width* then use only that number and every group will process more + * successive blocks given by group size. This is necessary in order to + * avoid recursion in the spreading phase of the group sums. The local cache + * memory is limited to *local_width*, every group stores its sum into an + * auxiliary buffer, which is then also summed (only one iteration needed, + * because there is only one group in the auxiliary buffer since we limit + * the number of groups to *local_width*. + * This is not be the final number of groups, it's just used to compute the + * number of iterations of every group. + */ + num_groups = MIN (local_width, NUM_CHUNKS (width, local_width)); + /* Number of iterations of every group is given by the number of pixels + * divided by the number of pixels *num_groups* can process. */ + num_group_iterations = NUM_CHUNKS (width, local_width * num_groups); + /* Finally, the real number of groups is given by the number of pixels + * divided by the group size and the number of group iterations. */ + num_groups = NUM_CHUNKS (width, num_group_iterations * local_width); + + /* Cache size must be larger by *local_size* / 16 because of the bank + * conflicts avoidance. Additionally, +1 is needed because of the shifted + * access to the local memory. + */ + cache_size = sizeof (cl_float) * (local_width + NUM_CHUNKS (local_width, 16) + 1); + cumsum_global_size[0] = num_groups * local_width / 2; + cumsum_global_size[1] = height; + block_sums_global_size[0] = local_width / 2; + block_sums_global_size[1] = height; + spread_global_size[0] = (num_groups - 1) * local_width; + spread_global_size[1] = height; + local_size[0] = local_width / 2; + local_size[1] = 1; + spread_local_size[0] = local_width; + spread_local_size[1] = 1; + if (!iteration_number) { + g_debug (" width: %d", width); + g_debug (" local width: %d", local_width); + g_debug (" num groups: %d", num_groups); + g_debug ("group iterations: %d", num_group_iterations); + g_debug (" kernel dims: %lu %lu %lu %lu", cumsum_global_size[0], cumsum_global_size[1], local_size[0], local_size[1]); + g_debug (" cache size: %lu", cache_size); + iteration_number++; + } + + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 0, sizeof (cl_mem), &in_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 1, sizeof (cl_mem), &out_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 2, sizeof (cl_mem), &priv->group_sums)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 3, cache_size, NULL)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 4, sizeof (gint), &num_group_iterations)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 5, sizeof (gint), &width)); + ufo_profiler_call (profiler, cmd_queue, priv->cumsum_kernel, 2, cumsum_global_size, local_size); + + if (num_groups > 1) { + /* If there are more than one group, we need to spread the partial sums + * of the groups to successive groups. First compute the sums of the + * groups and then spread them to respective pixels in them. Because of + * our choice of number of iterations per group above, the partial sums + * have to be summed only once without recursion. + */ + /* First sum the partial sums. */ + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 0, sizeof (cl_mem), &priv->group_sums)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 1, sizeof (cl_mem), &priv->group_sums)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 2, sizeof (cl_mem), NULL)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 3, cache_size, NULL)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 4, sizeof (gint), &one)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 5, sizeof (gint), &local_width)); + ufo_profiler_call (profiler, cmd_queue, priv->cumsum_kernel, 2, block_sums_global_size, local_size); + + /* Spread them across all pixels. */ + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 0, sizeof (cl_mem), &out_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 1, sizeof (cl_mem), &priv->group_sums)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 2, sizeof (gint), &num_group_iterations)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 3, sizeof (gint), &width)); + ufo_profiler_call (profiler, cmd_queue, priv->spread_kernel, 2, spread_global_size, spread_local_size); + } +} + +static void +compute_sdx (UfoNonLocalMeansTaskPrivate *priv, + cl_command_queue cmd_queue, + UfoProfiler *profiler, + cl_mem in_mem, + cl_mem out_mem, + const gint dx, + const gint dy) +{ + gfloat var = priv->sigma * priv->sigma; + + /* First compute the shifted MSE */ + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 0, sizeof (cl_mem), &in_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 1, sizeof (cl_mem), &priv->aux_mem[0])); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 2, sizeof (cl_sampler), &priv->sampler)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 3, sizeof (gint), &dx)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 4, sizeof (gint), &dy)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 5, sizeof (gint), &priv->padding)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 6, sizeof (gint), &var)); + ufo_profiler_call (profiler, cmd_queue, priv->mse_kernel, 2, priv->padded_size, NULL); + + /* Horizontal cumsum and transposition */ + compute_cumsum (priv, cmd_queue, profiler, priv->aux_mem[0], priv->aux_mem[1], priv->padded_size[0], priv->padded_size[1]); + transpose (priv, cmd_queue, profiler, priv->aux_mem[1], priv->aux_mem[0], + priv->padded_size[0], priv->padded_size[1]); + + /* 2D cumsum is separable, so compute the horizontal cumsum of the transposed horizontally cumsummed image */ + compute_cumsum (priv, cmd_queue, profiler, priv->aux_mem[0], priv->aux_mem[1], priv->padded_size[1], priv->padded_size[0]); + transpose (priv, cmd_queue, profiler, priv->aux_mem[1], priv->aux_mem[0], + priv->padded_size[1], priv->padded_size[0]); +} + +static void +process_shift (UfoNonLocalMeansTaskPrivate *priv, + cl_command_queue cmd_queue, + UfoProfiler *profiler, + cl_mem input_image, + cl_mem integral_mem, + cl_mem out_mem, + const gint dx, + const gint dy) +{ + gint patch_size = 2 * priv->patch_radius + 1; + gfloat coeff = 1.0f / (priv->h * priv->h * patch_size * patch_size); + gint is_conjugate = 0; + + /* Compute r(x) = w(x) * f(x + dx) */ + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 0, sizeof (cl_mem), &input_image)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 1, sizeof (cl_mem), &integral_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 2, sizeof (cl_mem), &priv->weights_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 3, sizeof (cl_mem), &out_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 4, sizeof (cl_sampler), &priv->sampler)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 5, sizeof (gint), &dx)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 6, sizeof (gint), &dy)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 7, sizeof (gint), &priv->patch_radius)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 8, sizeof (gint), &priv->padding)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 9, sizeof (gfloat), &coeff)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 10, sizeof (gint), &is_conjugate)); + ufo_profiler_call (profiler, cmd_queue, priv->weight_kernel, 2, priv->cropped_size, NULL); + + if (dx != 0 || dy != 0) { + /* Compute r(x + dx) = w(x + dx) * f(x), which cannot be computed at + * once in the above kernel call because one work item would write to + * two global memory locations, thus creating a race condition. */ + is_conjugate = 1; + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 0, sizeof (cl_mem), &input_image)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 1, sizeof (cl_mem), &integral_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 2, sizeof (cl_mem), &priv->weights_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 3, sizeof (cl_mem), &out_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 4, sizeof (cl_sampler), &priv->sampler)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 5, sizeof (gint), &dx)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 6, sizeof (gint), &dy)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 7, sizeof (gint), &priv->patch_radius)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 8, sizeof (gint), &priv->padding)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 9, sizeof (gfloat), &coeff)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 10, sizeof (gint), &is_conjugate)); + ufo_profiler_call (profiler, cmd_queue, priv->weight_kernel, 2, priv->cropped_size, NULL); + } +} + +static void +compute_nlm_fast (UfoNonLocalMeansTaskPrivate *priv, + cl_command_queue cmd_queue, + UfoProfiler *profiler, + cl_mem in_mem, + cl_mem out_mem) +{ + gint dx, dy, sr; + sr = (gint) priv->search_radius; + + /* Every pixel computes its own result and spreads it to the pixel y + dy, + * thus we start at dy = 0 every pixel gets its dy < 0 value from the pixel + * y - dy. For dy = 0, compute only pixels to the right, so the negative + * values are obtained from the pixels to the left (analogous to the dy + * case). + */ + for (dy = 0; dy < sr + 1; dy++) { + for (dx = dy == 0 ? 0 : -sr; dx < sr + 1; dx++) { + compute_sdx (priv, cmd_queue, profiler, in_mem, out_mem, dx, dy); + process_shift (priv, cmd_queue, profiler, in_mem, priv->aux_mem[0], out_mem, dx, dy); + } + } + /* Now we have the sum of results and weights, divide them to get the + * result. + */ + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->divide_kernel, 0, sizeof (cl_mem), &priv->weights_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->divide_kernel, 1, sizeof (cl_mem), &out_mem)); + ufo_profiler_call (profiler, cmd_queue, priv->divide_kernel, 2, priv->cropped_size, NULL); +} + UfoNode * ufo_non_local_means_task_new (void) { @@ -131,11 +446,44 @@ ufo_non_local_means_task_setup (UfoTask *task, cl_int err; priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task); - priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error); + if (priv->fast) { + priv->mse_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "compute_shifted_mse", NULL, error); + priv->cumsum_kernel = ufo_resources_get_kernel (resources, "cumsum.cl", "cumsum", NULL, error); + priv->spread_kernel = ufo_resources_get_kernel (resources, "cumsum.cl", "spread_block_sums", NULL, error); + priv->transpose_kernel = ufo_resources_get_kernel (resources, "transpose.cl", "transpose_shared", NULL, error); + priv->weight_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "process_shift", NULL, error); + priv->divide_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "divide_inplace", NULL, error); + } else { + priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error); + } priv->window_mem = NULL; + priv->group_sums = NULL; + priv->weights_mem = NULL; + for (gint i = 0; i < 2; i++) { + priv->aux_mem[i] = NULL; + } - if (priv->kernel) + if (priv->kernel) { UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->kernel), error); + } + if (priv->mse_kernel) { + UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->mse_kernel), error); + } + if (priv->cumsum_kernel) { + UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->cumsum_kernel), error); + } + if (priv->spread_kernel) { + UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->spread_kernel), error); + } + if (priv->transpose_kernel) { + UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->transpose_kernel), error); + } + if (priv->weight_kernel) { + UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->weight_kernel), error); + } + if (priv->divide_kernel) { + UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->divide_kernel), error); + } priv->context = ufo_resources_get_context (resources); UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainContext (priv->context), error); @@ -155,10 +503,35 @@ ufo_non_local_means_task_get_requisition (UfoTask *task, GError **error) { UfoNonLocalMeansTaskPrivate *priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task); + UfoGpuNode *node; + GValue *max_work_group_size_gvalue; + ufo_buffer_get_requisition (inputs[0], requisition); + if (priv->use_window && !priv->window_mem) { create_coefficients (priv); } + if (priv->fast) { + if (priv->max_work_group_size == 0) { + node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task))); + max_work_group_size_gvalue = ufo_gpu_node_get_info (node, UFO_GPU_NODE_INFO_MAX_WORK_GROUP_SIZE); + priv->max_work_group_size = g_value_get_ulong (max_work_group_size_gvalue); + g_value_unset (max_work_group_size_gvalue); + } + if (priv->cropped_size[0] == 0 || priv->cropped_size[0] != requisition->dims[0] || + priv->cropped_size[1] != requisition->dims[1]) { + if (priv->cropped_size[0] != 0) { + /* Size changed, release first. */ + release_fast_buffers (priv); + } + priv->cropped_size[0] = requisition->dims[0]; + priv->cropped_size[1] = requisition->dims[1]; + priv->padding = priv->search_radius + priv->patch_radius + 1; + priv->padded_size[0] = priv->cropped_size[0] + 2 * priv->padding; + priv->padded_size[1] = priv->cropped_size[1] + 2 * priv->padding; + create_fast_buffers (priv); + } + } } static guint @@ -189,30 +562,43 @@ ufo_non_local_means_task_process (UfoTask *task, UfoNonLocalMeansTaskPrivate *priv; UfoGpuNode *node; UfoProfiler *profiler; + UfoRequisition in_req; cl_command_queue cmd_queue; cl_mem in_mem; cl_mem out_mem; gfloat h, var; + gfloat fill_pattern = 0.0f; priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task); node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task))); cmd_queue = ufo_gpu_node_get_cmd_queue (node); - in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue); out_mem = ufo_buffer_get_device_array (output, cmd_queue); + profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task)); + ufo_buffer_get_requisition (inputs[0], &in_req); h = 1 / priv->h / priv->h; var = priv->sigma * priv->sigma; - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &h)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 6, sizeof (gfloat), &var)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 7, sizeof (cl_mem), &priv->window_mem)); - - profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task)); - ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL); + if (priv->fast) { + in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue); + UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, out_mem, &fill_pattern, sizeof (gfloat), 0, + sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1], + 0, NULL, NULL)); + UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, priv->weights_mem, &fill_pattern, sizeof (gfloat), 0, + sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1], + 0, NULL, NULL)); + compute_nlm_fast (priv, cmd_queue, profiler, in_mem, out_mem); + } else { + in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &h)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 6, sizeof (gfloat), &var)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 7, sizeof (cl_mem), &priv->window_mem)); + ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL); + } return TRUE; } @@ -242,6 +628,9 @@ ufo_non_local_means_task_set_property (GObject *object, case PROP_WINDOW: priv->use_window = g_value_get_boolean (value); break; + case PROP_FAST: + priv->fast = g_value_get_boolean (value); + break; case PROP_ADDRESSING_MODE: priv->addressing_mode = g_value_get_enum (value); break; @@ -275,6 +664,9 @@ ufo_non_local_means_task_get_property (GObject *object, case PROP_WINDOW: g_value_set_boolean (value, priv->use_window); break; + case PROP_FAST: + g_value_set_boolean (value, priv->fast); + break; case PROP_ADDRESSING_MODE: g_value_set_enum (value, priv->addressing_mode); break; @@ -295,6 +687,30 @@ ufo_non_local_means_task_finalize (GObject *object) UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->kernel)); priv->kernel = NULL; } + if (priv->mse_kernel) { + UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->mse_kernel)); + priv->mse_kernel = NULL; + } + if (priv->cumsum_kernel) { + UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->cumsum_kernel)); + priv->cumsum_kernel = NULL; + } + if (priv->spread_kernel) { + UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->spread_kernel)); + priv->spread_kernel = NULL; + } + if (priv->transpose_kernel) { + UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->transpose_kernel)); + priv->transpose_kernel = NULL; + } + if (priv->weight_kernel) { + UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->weight_kernel)); + priv->weight_kernel = NULL; + } + if (priv->divide_kernel) { + UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->divide_kernel)); + priv->divide_kernel = NULL; + } if (priv->sampler) { UFO_RESOURCES_CHECK_CLERR (clReleaseSampler (priv->sampler)); priv->sampler = NULL; @@ -304,6 +720,7 @@ ufo_non_local_means_task_finalize (GObject *object) priv->context = NULL; } release_coefficients (priv); + release_fast_buffers (priv); G_OBJECT_CLASS (ufo_non_local_means_task_parent_class)->finalize (object); } @@ -363,6 +780,13 @@ ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass) TRUE, G_PARAM_READWRITE); + properties[PROP_FAST] = + g_param_spec_boolean ("fast", + "Use a faster version of the algorithm", + "Use a faster version of the algorithm", + TRUE, + G_PARAM_READWRITE); + properties[PROP_ADDRESSING_MODE] = g_param_spec_enum ("addressing-mode", "Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")", @@ -384,8 +808,15 @@ ufo_non_local_means_task_init(UfoNonLocalMeansTask *self) self->priv->search_radius = 10; self->priv->patch_radius = 3; + self->priv->cropped_size[0] = 0; + self->priv->cropped_size[1] = 0; + self->priv->padded_size[0] = 0; + self->priv->padded_size[1] = 0; + self->priv->padding = -1; self->priv->h = 0.1f; self->priv->sigma = 0.0f; + self->priv->max_work_group_size = 0; self->priv->use_window = TRUE; self->priv->addressing_mode = CL_ADDRESS_MIRRORED_REPEAT; + self->priv->fast = FALSE; } -- cgit v1.2.1 From 2009b00e1d086e6ccef2d3fab84da2eba7b41c60 Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Wed, 31 Jul 2019 17:03:46 +0200 Subject: NLM: Add estimate-sigma --- docs/filters.rst | 22 +++++- src/kernels/estimate-noise.cl | 42 +++++++++++ src/kernels/meson.build | 1 + src/ufo-non-local-means-task.c | 160 +++++++++++++++++++++++++++++++++++------ 4 files changed, 202 insertions(+), 23 deletions(-) create mode 100644 src/kernels/estimate-noise.cl diff --git a/docs/filters.rst b/docs/filters.rst index 2abe2ea..8afd0cb 100644 --- a/docs/filters.rst +++ b/docs/filters.rst @@ -542,16 +542,20 @@ Non-local-means denoising Smoothing control parameter, should be around noise standard deviation or slightly less. Higher h results in a smoother image but with blurred - features. + features. If it is 0, estimate noise standard deviation and use it as + the parameter value. .. gobj:prop:: sigma:float - Noise standard deviation, improves weights computation. + Noise standard deviation, improves weights computation. If it is zero, + it is *not* automatically estimated as opposed to :gobj:prop:`h`. + :gobj:prop:`estimate-sigma` has to be specified in order to override + :gobj:prop:`sigma` value. .. gobj:prop:: window:boolean Apply Gaussian profile with :math:`\sigma = \frac{P}{2}`, where :math:`P` - is the gobj:prop:`patch-radius` parameter to the weight computation + is the :gobj:prop:`patch-radius` parameter to the weight computation which decreases the influence of pixels towards the corners of the patches. @@ -562,6 +566,13 @@ Non-local-means denoising no Gaussian profile used and from the nature of the fast algorithm, floating point precision errors might occur for large images. + .. gobj:prop:: estimate-sigma:boolean + + Estimate sigma based on [#]_, which overrides :gobj:prop:`sigma` + parameter value. Only the first image in a sequence is used for + estimation and the estimated sigma is re-used for every consequent + image. + .. gobj:prop:: addressing-mode:enum Addressing mode specifies the behavior for pixels falling outside the @@ -573,6 +584,11 @@ Non-local-means denoising International Symposium on Biomedical Imaging: From Nano to Macro, 2008, pp. 1331-1334. DOI:10.1109/ISBI.2008.4541250 + .. [#] + J. Immerkaer, *Fast noise variance estimation* in Computer vision and image + understanding 64.2 (1996): 300-302. DOI:10.1006/cviu.1996.0060 + + Horizontal interpolation ------------------------ diff --git a/src/kernels/estimate-noise.cl b/src/kernels/estimate-noise.cl new file mode 100644 index 0000000..31815d3 --- /dev/null +++ b/src/kernels/estimate-noise.cl @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2011-2019 Karlsruhe Institute of Technology + * + * This file is part of Ufo. + * + * This library is free software: you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + */ + +kernel void +convolve_abs_laplacian_diff (read_only image2d_t input, + const sampler_t sampler, + global float *output) +{ + int idx = get_global_id (0); + int idy = get_global_id (1); + int width = get_global_size (0); + int height = get_global_size (1); + float sum = 0.0f; + float coeffs[3][3] = {{ 1.0f, -2.0f, 1.0f}, + {-2.0f, 4.0f, -2.0f}, + { 1.0f, -2.0f, 1.0f}}; + + for (int dy = -1; dy < 2; dy++) { + for (int dx = -1; dx < 2; dx++) { + sum += coeffs[dy + 1][dx + 1] * read_imagef (input, sampler, (float2) ((idx + dx + 0.5f) / width, + (idy + dy + 0.5f) / height)).x; + } + } + + output[idy * width + idx] = fabs (sum); +} diff --git a/src/kernels/meson.build b/src/kernels/meson.build index 03cdac5..4126c27 100644 --- a/src/kernels/meson.build +++ b/src/kernels/meson.build @@ -13,6 +13,7 @@ kernel_files = [ 'denoise.cl', 'dfi.cl', 'edge.cl', + 'estimate-noise.cl', 'ffc.cl', 'fft.cl', 'fftmult.cl', diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index 4c14c67..5ca2e2e 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -39,7 +39,9 @@ struct _UfoNonLocalMeansTaskPrivate { gfloat sigma; gboolean use_window; gboolean fast; - cl_kernel kernel, mse_kernel, cumsum_kernel, spread_kernel, transpose_kernel, weight_kernel, divide_kernel; + gboolean estimate_sigma; + cl_kernel kernel, mse_kernel, cumsum_kernel, spread_kernel, transpose_kernel, weight_kernel, divide_kernel, + convolution_kernel, sum_kernel; cl_sampler sampler; cl_context context; cl_mem window_mem, group_sums, aux_mem[2], weights_mem; @@ -62,6 +64,7 @@ enum { PROP_SIGMA, PROP_WINDOW, PROP_FAST, + PROP_ESTIMATE_SIGMA, PROP_ADDRESSING_MODE, N_PROPERTIES }; @@ -456,6 +459,9 @@ ufo_non_local_means_task_setup (UfoTask *task, } else { priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error); } + priv->convolution_kernel = ufo_resources_get_kernel (resources, "estimate-noise.cl", "convolve_abs_laplacian_diff", NULL, error); + priv->sum_kernel = ufo_resources_get_kernel (resources, "reductor.cl", "reduce_M_SUM", NULL, error); + priv->window_mem = NULL; priv->group_sums = NULL; priv->weights_mem = NULL; @@ -484,6 +490,12 @@ ufo_non_local_means_task_setup (UfoTask *task, if (priv->divide_kernel) { UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->divide_kernel), error); } + if (priv->convolution_kernel) { + UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->convolution_kernel), error); + } + if (priv->sum_kernel) { + UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->sum_kernel), error); + } priv->context = ufo_resources_get_context (resources); UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainContext (priv->context), error); @@ -508,27 +520,26 @@ ufo_non_local_means_task_get_requisition (UfoTask *task, ufo_buffer_get_requisition (inputs[0], requisition); + if (priv->max_work_group_size == 0) { + node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task))); + max_work_group_size_gvalue = ufo_gpu_node_get_info (node, UFO_GPU_NODE_INFO_MAX_WORK_GROUP_SIZE); + priv->max_work_group_size = g_value_get_ulong (max_work_group_size_gvalue); + g_value_unset (max_work_group_size_gvalue); + } if (priv->use_window && !priv->window_mem) { create_coefficients (priv); } - if (priv->fast) { - if (priv->max_work_group_size == 0) { - node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task))); - max_work_group_size_gvalue = ufo_gpu_node_get_info (node, UFO_GPU_NODE_INFO_MAX_WORK_GROUP_SIZE); - priv->max_work_group_size = g_value_get_ulong (max_work_group_size_gvalue); - g_value_unset (max_work_group_size_gvalue); - } - if (priv->cropped_size[0] == 0 || priv->cropped_size[0] != requisition->dims[0] || - priv->cropped_size[1] != requisition->dims[1]) { - if (priv->cropped_size[0] != 0) { - /* Size changed, release first. */ - release_fast_buffers (priv); - } - priv->cropped_size[0] = requisition->dims[0]; - priv->cropped_size[1] = requisition->dims[1]; + if (priv->cropped_size[0] != requisition->dims[0] || priv->cropped_size[1] != requisition->dims[1]) { + priv->cropped_size[0] = requisition->dims[0]; + priv->cropped_size[1] = requisition->dims[1]; + if (priv->fast) { priv->padding = priv->search_radius + priv->patch_radius + 1; priv->padded_size[0] = priv->cropped_size[0] + 2 * priv->padding; priv->padded_size[1] = priv->cropped_size[1] + 2 * priv->padding; + if (priv->group_sums) { + /* Buffers exist, which means size changed, release first. */ + release_fast_buffers (priv); + } create_fast_buffers (priv); } } @@ -553,6 +564,75 @@ ufo_non_local_means_task_get_mode (UfoTask *task) return UFO_TASK_MODE_PROCESSOR | UFO_TASK_MODE_GPU; } +static gfloat +compute_sigma (UfoNonLocalMeansTaskPrivate *priv, + cl_command_queue cmd_queue, + UfoProfiler *profiler, + cl_mem input_image, + cl_mem out_mem) +{ + gsize n = priv->cropped_size[0] * priv->cropped_size[1]; + gsize num_groups = MIN (priv->max_work_group_size, NUM_CHUNKS (n, priv->max_work_group_size)); + gsize num_group_iterations, global_size; + gfloat *result, sum = 0.0f; + cl_int err; + cl_mem group_sums; + + /* First compute the convolution of the input with the difference of + * laplacians. + */ + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->convolution_kernel, 0, sizeof (cl_mem), &input_image)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->convolution_kernel, 1, sizeof (cl_sampler), &priv->sampler)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->convolution_kernel, 2, sizeof (cl_mem), &out_mem)); + ufo_profiler_call (profiler, cmd_queue, priv->convolution_kernel, 2, priv->cropped_size, NULL); + + /* Now compute partial sums of the convolved image. */ + /* Number of iterations of every group is given by the number of pixels + * divided by the number of pixels *num_groups* can process. */ + num_group_iterations = NUM_CHUNKS (n, priv->max_work_group_size * num_groups); + /* The real number of groups is given by the number of pixels + * divided by the group size and the number of group iterations. */ + num_groups = NUM_CHUNKS (n, num_group_iterations * priv->max_work_group_size); + global_size = num_groups * priv->max_work_group_size; + + g_debug (" n: %lu", n); + g_debug (" num groups: %lu", num_groups); + g_debug ("group iterations: %lu", num_group_iterations); + g_debug (" kernel dims: %lu", global_size); + + result = g_malloc0 (sizeof (cl_float) * num_groups); + group_sums = clCreateBuffer (priv->context, + CL_MEM_READ_WRITE, + sizeof (cl_float) * num_groups, + NULL, + &err); + UFO_RESOURCES_CHECK_CLERR (err); + + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 0, sizeof (cl_mem), &out_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 1, sizeof (cl_mem), &group_sums)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 2, sizeof (cl_mem), &out_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 3, sizeof (cl_float) * priv->max_work_group_size, NULL)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 4, sizeof (gsize), &n)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 5, sizeof (gint), &num_group_iterations)); + ufo_profiler_call (profiler, cmd_queue, priv->sum_kernel, 1, &global_size, &priv->max_work_group_size); + + clEnqueueReadBuffer (cmd_queue, + group_sums, + CL_TRUE, + 0, sizeof (cl_float) * num_groups, + result, + 0, NULL, NULL); + UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (group_sums)); + + /* Sum partial sums computed by the groups. */ + for (gsize i = 0; i < num_groups; i++) { + sum += result[i]; + } + g_free (result); + + return sqrt (G_PI_2) / (6 * (priv->cropped_size[0] - 2.0f) * (priv->cropped_size[1] - 2.0f)) * sum; +} + static gboolean ufo_non_local_means_task_process (UfoTask *task, UfoBuffer **inputs, @@ -566,20 +646,39 @@ ufo_non_local_means_task_process (UfoTask *task, cl_command_queue cmd_queue; cl_mem in_mem; cl_mem out_mem; - gfloat h, var; + gfloat h, var, estimated_sigma; gfloat fill_pattern = 0.0f; + guint num_processed; priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task); node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task))); cmd_queue = ufo_gpu_node_get_cmd_queue (node); + in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue); out_mem = ufo_buffer_get_device_array (output, cmd_queue); profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task)); ufo_buffer_get_requisition (inputs[0], &in_req); + g_object_get (task, "num_processed", &num_processed, NULL); + + /* Estimate sigma only from the first image in the stream in order to + * keep the results consistent. + */ + if (num_processed == 0 && (priv->h <= 0.0f || priv->estimate_sigma)) { + /* Use out_mem for the convolution, it's not necessary after the + * computation and can be re-used by the de-noising itself. + */ + estimated_sigma = compute_sigma (priv, cmd_queue, profiler, in_mem, out_mem); + g_debug ("Estimated sigma: %g", estimated_sigma); + if (priv->h <= 0.0f) { + priv->h = estimated_sigma; + } + if (priv->estimate_sigma) { + priv->sigma = estimated_sigma; + } + } h = 1 / priv->h / priv->h; var = priv->sigma * priv->sigma; if (priv->fast) { - in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue); UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, out_mem, &fill_pattern, sizeof (gfloat), 0, sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1], 0, NULL, NULL)); @@ -588,7 +687,6 @@ ufo_non_local_means_task_process (UfoTask *task, 0, NULL, NULL)); compute_nlm_fast (priv, cmd_queue, profiler, in_mem, out_mem); } else { - in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler)); @@ -631,6 +729,9 @@ ufo_non_local_means_task_set_property (GObject *object, case PROP_FAST: priv->fast = g_value_get_boolean (value); break; + case PROP_ESTIMATE_SIGMA: + priv->estimate_sigma = g_value_get_boolean (value); + break; case PROP_ADDRESSING_MODE: priv->addressing_mode = g_value_get_enum (value); break; @@ -667,6 +768,9 @@ ufo_non_local_means_task_get_property (GObject *object, case PROP_FAST: g_value_set_boolean (value, priv->fast); break; + case PROP_ESTIMATE_SIGMA: + g_value_set_boolean (value, priv->estimate_sigma); + break; case PROP_ADDRESSING_MODE: g_value_set_enum (value, priv->addressing_mode); break; @@ -711,6 +815,14 @@ ufo_non_local_means_task_finalize (GObject *object) UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->divide_kernel)); priv->divide_kernel = NULL; } + if (priv->convolution_kernel) { + UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->convolution_kernel)); + priv->convolution_kernel = NULL; + } + if (priv->sum_kernel) { + UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->sum_kernel)); + priv->sum_kernel = NULL; + } if (priv->sampler) { UFO_RESOURCES_CHECK_CLERR (clReleaseSampler (priv->sampler)); priv->sampler = NULL; @@ -787,6 +899,13 @@ ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass) TRUE, G_PARAM_READWRITE); + properties[PROP_ESTIMATE_SIGMA] = + g_param_spec_boolean ("estimate-sigma", + "Estimate noise standard deviation", + "Estimate noise standard deviation", + FALSE, + G_PARAM_READWRITE); + properties[PROP_ADDRESSING_MODE] = g_param_spec_enum ("addressing-mode", "Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")", @@ -813,10 +932,11 @@ ufo_non_local_means_task_init(UfoNonLocalMeansTask *self) self->priv->padded_size[0] = 0; self->priv->padded_size[1] = 0; self->priv->padding = -1; - self->priv->h = 0.1f; + self->priv->h = 0.0f; self->priv->sigma = 0.0f; self->priv->max_work_group_size = 0; self->priv->use_window = TRUE; self->priv->addressing_mode = CL_ADDRESS_MIRRORED_REPEAT; self->priv->fast = FALSE; + self->priv->estimate_sigma = FALSE; } -- cgit v1.2.1 From b0166f3f68323add5d405b10b8468441b8ab0df9 Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Wed, 31 Jul 2019 17:54:31 +0200 Subject: NLM: use fmax (0.0f, value) only at the end --- src/kernels/nlm.cl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/kernels/nlm.cl b/src/kernels/nlm.cl index d2f6b0f..a094f34 100644 --- a/src/kernels/nlm.cl +++ b/src/kernels/nlm.cl @@ -44,16 +44,14 @@ compute_dist (read_only image2d_t input, tmp = read_imagef (input, sampler, (float2) ((p.x + i) / width, (p.y + j) / height)).x - read_imagef (input, sampler, (float2) ((q.x + i) / width, (q.y + j) / height)).x; if (window_coeffs) { - /* Use gaussian window. - * Cutoff negative numbers which would cause large weights. */ - dist += fmax (0.0f, window_coeffs[(j + radius) * wsize + (i + radius)] * (tmp * tmp - 2 * variance)); + dist += window_coeffs[(j + radius) * wsize + (i + radius)] * (tmp * tmp - 2 * variance); } else { - dist += fmax (0.0f, tmp * tmp - 2 * variance); + dist += tmp * tmp - 2 * variance; } } } - return dist * coeff; + return fmax (0.0f, dist) * coeff; } kernel void -- cgit v1.2.1 From c5c4855c86fa39f1d65ff8fffe96b6beb772814a Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Wed, 31 Jul 2019 18:00:15 +0200 Subject: NLM: clean up Gauss window creation and deletion --- src/ufo-non-local-means-task.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index 5ca2e2e..86857d2 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -94,14 +94,8 @@ compute_cumsum_local_width (UfoNonLocalMeansTaskPrivate *priv) return local_width; } -/** - * release_coefficients: - * @priv: UfoNonLocalMeansTaskPrivate - * - * Release Gaussian window coefficients memory object. - */ static void -release_coefficients (UfoNonLocalMeansTaskPrivate *priv) +release_gaussian_window (UfoNonLocalMeansTaskPrivate *priv) { if (priv->window_mem) { UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->window_mem)); @@ -109,14 +103,8 @@ release_coefficients (UfoNonLocalMeansTaskPrivate *priv) } } -/** - * create_coefficients: - * @priv: UfoNonLocalMeansTaskPrivate - * - * Compute Gaussian window coefficients. - */ static void -create_coefficients (UfoNonLocalMeansTaskPrivate *priv) +create_gaussian_window (UfoNonLocalMeansTaskPrivate *priv) { cl_int err; gfloat *coefficients, coefficients_sum = 0.0f, sigma; @@ -138,7 +126,6 @@ create_coefficients (UfoNonLocalMeansTaskPrivate *priv) coefficients[i] /= coefficients_sum; } - release_coefficients (priv); priv->window_mem = clCreateBuffer (priv->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof (cl_float) * wsize * wsize, @@ -526,8 +513,8 @@ ufo_non_local_means_task_get_requisition (UfoTask *task, priv->max_work_group_size = g_value_get_ulong (max_work_group_size_gvalue); g_value_unset (max_work_group_size_gvalue); } - if (priv->use_window && !priv->window_mem) { - create_coefficients (priv); + if (!priv->fast && priv->use_window && !priv->window_mem) { + create_gaussian_window (priv); } if (priv->cropped_size[0] != requisition->dims[0] || priv->cropped_size[1] != requisition->dims[1]) { priv->cropped_size[0] = requisition->dims[0]; @@ -831,7 +818,7 @@ ufo_non_local_means_task_finalize (GObject *object) UFO_RESOURCES_CHECK_CLERR (clReleaseContext (priv->context)); priv->context = NULL; } - release_coefficients (priv); + release_gaussian_window (priv); release_fast_buffers (priv); G_OBJECT_CLASS (ufo_non_local_means_task_parent_class)->finalize (object); -- cgit v1.2.1 From acbc885113a424f027a7416249539a934d3059c2 Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Thu, 1 Aug 2019 08:23:46 +0200 Subject: NLM: add test --- tests/CMakeLists.txt | 3 +++ tests/meson.build | 3 ++- tests/test-nlm.sh | 11 +++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) create mode 100755 tests/test-nlm.sh diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ddb567c..8de94f1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -20,3 +20,6 @@ add_test(test_177 add_test(test_core_149 ${BASH} "${CMAKE_CURRENT_SOURCE_DIR}/test-core-149.sh") + +add_test(test_nlm + ${BASH} "${CMAKE_CURRENT_SOURCE_DIR}/test-nlm.sh") diff --git a/tests/meson.build b/tests/meson.build index f95722f..6b720d2 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -3,7 +3,8 @@ tests = [ 'test-153', 'test-161', 'test-core-149', - 'test-file-write-regression' + 'test-file-write-regression', + 'test-nlm' ] tiffinfo = find_program('tiffinfo', required : false) diff --git a/tests/test-nlm.sh b/tests/test-nlm.sh new file mode 100755 index 0000000..09b75dd --- /dev/null +++ b/tests/test-nlm.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +python -c "import numpy; import tifffile; tifffile.imsave('nlm-noise-input.tif', +numpy.random.normal(100., 10., size=(128, 128)).astype(numpy.float32))" + +ufo-launch -q read path=nlm-noise-input.tif ! non-local-means patch-radius=3 search-radius=5 fast=False estimate-sigma=True addressing-mode=mirrored_repeat window=False ! write filename=ufo-nlm-slow.tif + +ufo-launch -q read path=nlm-noise-input.tif ! non-local-means patch-radius=3 search-radius=5 fast=True estimate-sigma=True addressing-mode=mirrored_repeat ! write filename=ufo-nlm-fast.tif + +# Fast and slow difference with respect to the mean must be less than 0.001 % +python -c "import sys; import tifffile; a = tifffile.imread('ufo-nlm-slow.tif'); b = tifffile.imread('ufo-nlm-fast.tif'); sys.exit(int(abs(a - b).max() > 1e-3))" -- cgit v1.2.1 From 861c41d886ccb4feec36a9387b05e2f5630fba33 Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Fri, 2 Aug 2019 16:13:44 +0200 Subject: NLM: Make sure local size is power of 2 --- src/ufo-non-local-means-task.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index 86857d2..3a9b1a7 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -71,6 +71,15 @@ enum { static GParamSpec *properties[N_PROPERTIES] = { NULL, }; +static gsize +compute_closest_smaller_power_of_2 (gsize value) +{ + gdouble integer; + modf (log2 (value), &integer); + + return (gsize) pow (2, integer); +} + static gint compute_cumsum_local_width (UfoNonLocalMeansTaskPrivate *priv) { @@ -79,8 +88,7 @@ compute_cumsum_local_width (UfoNonLocalMeansTaskPrivate *priv) /* Compute global and local dimensions for the cumsum kernel */ /* First make sure local_width is a power of 2 */ - modf (log2 (priv->max_work_group_size), &integer); - local_width = (gint) pow (2, integer); + local_width = (gint) compute_closest_smaller_power_of_2 (priv->max_work_group_size); if (local_width > 4) { /* Empirically determined value on NVIDIA cards */ local_width /= 4; @@ -559,8 +567,7 @@ compute_sigma (UfoNonLocalMeansTaskPrivate *priv, cl_mem out_mem) { gsize n = priv->cropped_size[0] * priv->cropped_size[1]; - gsize num_groups = MIN (priv->max_work_group_size, NUM_CHUNKS (n, priv->max_work_group_size)); - gsize num_group_iterations, global_size; + gsize local_size, num_groups, num_group_iterations, global_size; gfloat *result, sum = 0.0f; cl_int err; cl_mem group_sums; @@ -574,18 +581,23 @@ compute_sigma (UfoNonLocalMeansTaskPrivate *priv, ufo_profiler_call (profiler, cmd_queue, priv->convolution_kernel, 2, priv->cropped_size, NULL); /* Now compute partial sums of the convolved image. */ + /* Compute global and local dimensions for the cumsum kernel */ + /* Make sure local_size is a power of 2 */ + local_size = compute_closest_smaller_power_of_2 (priv->max_work_group_size); /* Number of iterations of every group is given by the number of pixels * divided by the number of pixels *num_groups* can process. */ - num_group_iterations = NUM_CHUNKS (n, priv->max_work_group_size * num_groups); + num_groups = MIN (local_size, NUM_CHUNKS (n, local_size)); + num_group_iterations = NUM_CHUNKS (n, local_size * num_groups); /* The real number of groups is given by the number of pixels * divided by the group size and the number of group iterations. */ - num_groups = NUM_CHUNKS (n, num_group_iterations * priv->max_work_group_size); - global_size = num_groups * priv->max_work_group_size; + num_groups = NUM_CHUNKS (n, num_group_iterations * local_size); + global_size = num_groups * local_size; - g_debug (" n: %lu", n); - g_debug (" num groups: %lu", num_groups); - g_debug ("group iterations: %lu", num_group_iterations); - g_debug (" kernel dims: %lu", global_size); + g_debug (" n: %lu", n); + g_debug (" num groups: %lu", num_groups); + g_debug (" group iterations: %lu", num_group_iterations); + g_debug ("kernel global size: %lu", global_size); + g_debug (" kernel local size: %lu", local_size); result = g_malloc0 (sizeof (cl_float) * num_groups); group_sums = clCreateBuffer (priv->context, @@ -598,10 +610,10 @@ compute_sigma (UfoNonLocalMeansTaskPrivate *priv, UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 0, sizeof (cl_mem), &out_mem)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 1, sizeof (cl_mem), &group_sums)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 2, sizeof (cl_mem), &out_mem)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 3, sizeof (cl_float) * priv->max_work_group_size, NULL)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 3, sizeof (cl_float) * local_size, NULL)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 4, sizeof (gsize), &n)); UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 5, sizeof (gint), &num_group_iterations)); - ufo_profiler_call (profiler, cmd_queue, priv->sum_kernel, 1, &global_size, &priv->max_work_group_size); + ufo_profiler_call (profiler, cmd_queue, priv->sum_kernel, 1, &global_size, &local_size); clEnqueueReadBuffer (cmd_queue, group_sums, -- cgit v1.2.1 From 1028504583fe0ba72ecce004610e50a6eca42474 Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Mon, 5 Aug 2019 08:58:37 +0200 Subject: NLM: Put common things to src/common --- src/CMakeLists.txt | 4 ++ src/common/ufo-common.c | 104 ++++++++++++++++++++++++++++++++++++++++ src/common/ufo-common.h | 41 ++++++++++++++++ src/common/ufo-math.c | 9 ++++ src/common/ufo-math.h | 3 ++ src/meson.build | 15 +++++- src/ufo-non-local-means-task.c | 105 +++++++---------------------------------- 7 files changed, 191 insertions(+), 90 deletions(-) create mode 100644 src/common/ufo-common.c create mode 100644 src/common/ufo-common.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d4bc70c..6f69f19 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -127,6 +127,10 @@ set(general_backproject_aux_SRCS common/ufo-scarray.c common/ufo-ctgeometry.c) +set(non_local_means_aux_SRCS + common/ufo-math.c + common/ufo-common.c) + file(GLOB ufofilter_KERNELS "kernels/*.cl") #}}} #{{{ Variables diff --git a/src/common/ufo-common.c b/src/common/ufo-common.c new file mode 100644 index 0000000..f157cf4 --- /dev/null +++ b/src/common/ufo-common.c @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2015-2019 Karlsruhe Institute of Technology + * + * This file is part of Ufo. + * + * This library is free software: you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + */ + +#include +#include +#include "ufo-math.h" +#include "ufo-common.h" + +gfloat +ufo_common_estimate_sigma (cl_kernel convolution_kernel, + cl_kernel sum_kernel, + cl_command_queue cmd_queue, + cl_sampler sampler, + UfoProfiler *profiler, + cl_mem input_image, + cl_mem out_mem, + const gsize max_work_group_size, + const gsize *global_size) +{ + gsize n = global_size[0] * global_size[1]; + gsize local_size, num_groups, global_size_1D; + gint num_group_iterations; + gfloat *result, sum = 0.0f; + cl_int err; + cl_mem group_sums; + cl_context context; + + clGetCommandQueueInfo (cmd_queue, CL_QUEUE_CONTEXT, sizeof (cl_context), &context, NULL); + + /* First compute the convolution of the input with the difference of + * laplacians. + */ + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (convolution_kernel, 0, sizeof (cl_mem), &input_image)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (convolution_kernel, 1, sizeof (cl_sampler), &sampler)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (convolution_kernel, 2, sizeof (cl_mem), &out_mem)); + ufo_profiler_call (profiler, cmd_queue, convolution_kernel, 2, global_size, NULL); + + /* Now compute partial sums of the convolved image. */ + /* Compute global and local dimensions for the cumsum kernel */ + /* Make sure local_size is a power of 2 */ + local_size = ufo_math_compute_closest_smaller_power_of_2 (max_work_group_size); + /* Number of iterations of every group is given by the number of pixels + * divided by the number of pixels *num_groups* can process. */ + num_groups = MIN (local_size, UFO_MATH_NUM_CHUNKS (n, local_size)); + num_group_iterations = UFO_MATH_NUM_CHUNKS (n, local_size * num_groups); + /* The real number of groups is given by the number of pixels + * divided by the group size and the number of group iterations. */ + num_groups = UFO_MATH_NUM_CHUNKS (n, num_group_iterations * local_size); + global_size_1D = num_groups * local_size; + + g_debug (" n: %lu", n); + g_debug (" num groups: %lu", num_groups); + g_debug (" group iterations: %d", num_group_iterations); + g_debug ("kernel global size: %lu", global_size_1D); + g_debug (" kernel local size: %lu", local_size); + + result = g_malloc0 (sizeof (cl_float) * num_groups); + group_sums = clCreateBuffer (context, + CL_MEM_READ_WRITE, + sizeof (cl_float) * num_groups, + NULL, + &err); + UFO_RESOURCES_CHECK_CLERR (err); + + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 0, sizeof (cl_mem), &out_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 1, sizeof (cl_mem), &group_sums)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 2, sizeof (cl_mem), &out_mem)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 3, sizeof (cl_float) * local_size, NULL)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 4, sizeof (gsize), &n)); + UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (sum_kernel, 5, sizeof (gint), &num_group_iterations)); + ufo_profiler_call (profiler, cmd_queue, sum_kernel, 1, &global_size_1D, &local_size); + + clEnqueueReadBuffer (cmd_queue, + group_sums, + CL_TRUE, + 0, sizeof (cl_float) * num_groups, + result, + 0, NULL, NULL); + UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (group_sums)); + + /* Sum partial sums computed by the groups. */ + for (gsize i = 0; i < num_groups; i++) { + sum += result[i]; + } + g_free (result); + + return sqrt (G_PI_2) / (6 * (global_size[0] - 2.0f) * (global_size[1] - 2.0f)) * sum; +} diff --git a/src/common/ufo-common.h b/src/common/ufo-common.h new file mode 100644 index 0000000..f6e2349 --- /dev/null +++ b/src/common/ufo-common.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2015-2019 Karlsruhe Institute of Technology + * + * This file is part of Ufo. + * + * This library is free software: you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, either + * version 3 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see . + */ + +#ifndef UFO_COMMON_H +#define UFO_COMMON_H + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include + +gfloat ufo_common_estimate_sigma (cl_kernel convolution_kernel, + cl_kernel sum_kernel, + cl_command_queue cmd_queue, + cl_sampler sampler, + UfoProfiler *profiler, + cl_mem input_image, + cl_mem out_mem, + const gsize max_work_group_size, + const gsize *global_size); + +#endif diff --git a/src/common/ufo-math.c b/src/common/ufo-math.c index 78b7dda..725b7e6 100644 --- a/src/common/ufo-math.c +++ b/src/common/ufo-math.c @@ -140,3 +140,12 @@ ufo_array_minimum (gdouble *array, gint num_values) { return find_extremum (array, num_values, 1); } + +gsize +ufo_math_compute_closest_smaller_power_of_2 (gsize value) +{ + gdouble integer; + modf (log2 (value), &integer); + + return (gsize) pow (2, integer); +} diff --git a/src/common/ufo-math.h b/src/common/ufo-math.h index 70084ce..8b283e1 100644 --- a/src/common/ufo-math.h +++ b/src/common/ufo-math.h @@ -24,6 +24,8 @@ #define UFO_MATH_EPSILON 1e-7 #define UFO_MATH_ARE_ALMOST_EQUAL(a, b) (ABS ((a) - (b)) < UFO_MATH_EPSILON) +#define UFO_MATH_NUM_CHUNKS(n, k) (((n) - 1) / (k) + 1) + typedef struct { gdouble x, y, z; @@ -54,5 +56,6 @@ gdouble ufo_array_minimum (gdouble *array, gdouble ufo_clip_value (gdouble value, gdouble minimum, gdouble maximum); +gsize ufo_math_compute_closest_smaller_power_of_2 (gsize value); #endif diff --git a/src/meson.build b/src/meson.build index eb981b2..4cd4bdc 100644 --- a/src/meson.build +++ b/src/meson.build @@ -48,7 +48,6 @@ plugins = [ 'merge', 'metaballs', 'monitor', - 'non-local-means', 'null', 'opencl', 'opencl-reduce', @@ -171,6 +170,20 @@ shared_module('conebeamprojectionweight', install_dir: plugin_install_dir, ) +# non local means + +shared_module('nonlocalmeans', + sources: [ + 'ufo-non-local-means-task.c', + 'common/ufo-math.c', + 'common/ufo-common.c', + ], + dependencies: deps, + name_prefix: 'libufofilter', + install: true, + install_dir: plugin_install_dir, +) + # fft plugins have_clfft = clfft_dep.found() diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index 3a9b1a7..a0eff2c 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -25,10 +25,11 @@ #include #include "ufo-non-local-means-task.h" +#include "common/ufo-math.h" #include "common/ufo-addressing.h" +#include "common/ufo-common.h" #define PIXELS_PER_THREAD 4 -#define NUM_CHUNKS(n, k) (((n) - 1) / (k) + 1) struct _UfoNonLocalMeansTaskPrivate { guint search_radius; @@ -71,15 +72,6 @@ enum { static GParamSpec *properties[N_PROPERTIES] = { NULL, }; -static gsize -compute_closest_smaller_power_of_2 (gsize value) -{ - gdouble integer; - modf (log2 (value), &integer); - - return (gsize) pow (2, integer); -} - static gint compute_cumsum_local_width (UfoNonLocalMeansTaskPrivate *priv) { @@ -88,7 +80,7 @@ compute_cumsum_local_width (UfoNonLocalMeansTaskPrivate *priv) /* Compute global and local dimensions for the cumsum kernel */ /* First make sure local_width is a power of 2 */ - local_width = (gint) compute_closest_smaller_power_of_2 (priv->max_work_group_size); + local_width = (gint) ufo_math_compute_closest_smaller_power_of_2 (priv->max_work_group_size); if (local_width > 4) { /* Empirically determined value on NVIDIA cards */ local_width /= 4; @@ -253,19 +245,19 @@ compute_cumsum (UfoNonLocalMeansTaskPrivate *priv, * This is not be the final number of groups, it's just used to compute the * number of iterations of every group. */ - num_groups = MIN (local_width, NUM_CHUNKS (width, local_width)); + num_groups = MIN (local_width, UFO_MATH_NUM_CHUNKS (width, local_width)); /* Number of iterations of every group is given by the number of pixels * divided by the number of pixels *num_groups* can process. */ - num_group_iterations = NUM_CHUNKS (width, local_width * num_groups); + num_group_iterations = UFO_MATH_NUM_CHUNKS (width, local_width * num_groups); /* Finally, the real number of groups is given by the number of pixels * divided by the group size and the number of group iterations. */ - num_groups = NUM_CHUNKS (width, num_group_iterations * local_width); + num_groups = UFO_MATH_NUM_CHUNKS (width, num_group_iterations * local_width); /* Cache size must be larger by *local_size* / 16 because of the bank * conflicts avoidance. Additionally, +1 is needed because of the shifted * access to the local memory. */ - cache_size = sizeof (cl_float) * (local_width + NUM_CHUNKS (local_width, 16) + 1); + cache_size = sizeof (cl_float) * (local_width + UFO_MATH_NUM_CHUNKS (local_width, 16) + 1); cumsum_global_size[0] = num_groups * local_width / 2; cumsum_global_size[1] = height; block_sums_global_size[0] = local_width / 2; @@ -559,79 +551,6 @@ ufo_non_local_means_task_get_mode (UfoTask *task) return UFO_TASK_MODE_PROCESSOR | UFO_TASK_MODE_GPU; } -static gfloat -compute_sigma (UfoNonLocalMeansTaskPrivate *priv, - cl_command_queue cmd_queue, - UfoProfiler *profiler, - cl_mem input_image, - cl_mem out_mem) -{ - gsize n = priv->cropped_size[0] * priv->cropped_size[1]; - gsize local_size, num_groups, num_group_iterations, global_size; - gfloat *result, sum = 0.0f; - cl_int err; - cl_mem group_sums; - - /* First compute the convolution of the input with the difference of - * laplacians. - */ - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->convolution_kernel, 0, sizeof (cl_mem), &input_image)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->convolution_kernel, 1, sizeof (cl_sampler), &priv->sampler)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->convolution_kernel, 2, sizeof (cl_mem), &out_mem)); - ufo_profiler_call (profiler, cmd_queue, priv->convolution_kernel, 2, priv->cropped_size, NULL); - - /* Now compute partial sums of the convolved image. */ - /* Compute global and local dimensions for the cumsum kernel */ - /* Make sure local_size is a power of 2 */ - local_size = compute_closest_smaller_power_of_2 (priv->max_work_group_size); - /* Number of iterations of every group is given by the number of pixels - * divided by the number of pixels *num_groups* can process. */ - num_groups = MIN (local_size, NUM_CHUNKS (n, local_size)); - num_group_iterations = NUM_CHUNKS (n, local_size * num_groups); - /* The real number of groups is given by the number of pixels - * divided by the group size and the number of group iterations. */ - num_groups = NUM_CHUNKS (n, num_group_iterations * local_size); - global_size = num_groups * local_size; - - g_debug (" n: %lu", n); - g_debug (" num groups: %lu", num_groups); - g_debug (" group iterations: %lu", num_group_iterations); - g_debug ("kernel global size: %lu", global_size); - g_debug (" kernel local size: %lu", local_size); - - result = g_malloc0 (sizeof (cl_float) * num_groups); - group_sums = clCreateBuffer (priv->context, - CL_MEM_READ_WRITE, - sizeof (cl_float) * num_groups, - NULL, - &err); - UFO_RESOURCES_CHECK_CLERR (err); - - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 0, sizeof (cl_mem), &out_mem)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 1, sizeof (cl_mem), &group_sums)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 2, sizeof (cl_mem), &out_mem)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 3, sizeof (cl_float) * local_size, NULL)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 4, sizeof (gsize), &n)); - UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->sum_kernel, 5, sizeof (gint), &num_group_iterations)); - ufo_profiler_call (profiler, cmd_queue, priv->sum_kernel, 1, &global_size, &local_size); - - clEnqueueReadBuffer (cmd_queue, - group_sums, - CL_TRUE, - 0, sizeof (cl_float) * num_groups, - result, - 0, NULL, NULL); - UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (group_sums)); - - /* Sum partial sums computed by the groups. */ - for (gsize i = 0; i < num_groups; i++) { - sum += result[i]; - } - g_free (result); - - return sqrt (G_PI_2) / (6 * (priv->cropped_size[0] - 2.0f) * (priv->cropped_size[1] - 2.0f)) * sum; -} - static gboolean ufo_non_local_means_task_process (UfoTask *task, UfoBuffer **inputs, @@ -665,7 +584,15 @@ ufo_non_local_means_task_process (UfoTask *task, /* Use out_mem for the convolution, it's not necessary after the * computation and can be re-used by the de-noising itself. */ - estimated_sigma = compute_sigma (priv, cmd_queue, profiler, in_mem, out_mem); + estimated_sigma = ufo_common_estimate_sigma (priv->convolution_kernel, + priv->sum_kernel, + cmd_queue, + priv->sampler, + profiler, + in_mem, + out_mem, + priv->max_work_group_size, + priv->cropped_size); g_debug ("Estimated sigma: %g", estimated_sigma); if (priv->h <= 0.0f) { priv->h = estimated_sigma; -- cgit v1.2.1 From c94277081c5c0d6cb9f803bc96523dc7ff73a53a Mon Sep 17 00:00:00 2001 From: Tomas Farago Date: Wed, 5 Feb 2020 10:23:18 +0100 Subject: nlm: use fast algorithm by default --- src/ufo-non-local-means-task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c index a0eff2c..4848d42 100644 --- a/src/ufo-non-local-means-task.c +++ b/src/ufo-non-local-means-task.c @@ -863,6 +863,6 @@ ufo_non_local_means_task_init(UfoNonLocalMeansTask *self) self->priv->max_work_group_size = 0; self->priv->use_window = TRUE; self->priv->addressing_mode = CL_ADDRESS_MIRRORED_REPEAT; - self->priv->fast = FALSE; + self->priv->fast = TRUE; self->priv->estimate_sigma = FALSE; } -- cgit v1.2.1