/*
* Copyright (C) 2011-2015 Karlsruhe Institute of Technology
*
* This file is part of Ufo.
*
* This library is free software: you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, either
* version 3 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see .
*/
#ifdef __APPLE__
#include
#else
#include
#endif
#include
#include "ufo-non-local-means-task.h"
#include "common/ufo-math.h"
#include "common/ufo-addressing.h"
#include "common/ufo-common.h"
#define PIXELS_PER_THREAD 4
struct _UfoNonLocalMeansTaskPrivate {
guint search_radius;
guint patch_radius;
gsize max_work_group_size, cropped_size[2], padded_size[2];
gint padding;
gfloat h;
gfloat sigma;
gboolean use_window;
gboolean fast;
gboolean estimate_sigma;
cl_kernel kernel, mse_kernel, cumsum_kernel, spread_kernel, transpose_kernel, weight_kernel, divide_kernel,
convolution_kernel, sum_kernel;
cl_sampler sampler;
cl_context context;
cl_mem window_mem, group_sums, aux_mem[2], weights_mem;
AddressingMode addressing_mode;
};
static void ufo_task_interface_init (UfoTaskIface *iface);
G_DEFINE_TYPE_WITH_CODE (UfoNonLocalMeansTask, ufo_non_local_means_task, UFO_TYPE_TASK_NODE,
G_IMPLEMENT_INTERFACE (UFO_TYPE_TASK,
ufo_task_interface_init))
#define UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE(obj) (G_TYPE_INSTANCE_GET_PRIVATE((obj), UFO_TYPE_NON_LOCAL_MEANS_TASK, UfoNonLocalMeansTaskPrivate))
enum {
PROP_0,
PROP_SEARCH_RADIUS,
PROP_PATCH_RADIUS,
PROP_H,
PROP_SIGMA,
PROP_WINDOW,
PROP_FAST,
PROP_ESTIMATE_SIGMA,
PROP_ADDRESSING_MODE,
N_PROPERTIES
};
static GParamSpec *properties[N_PROPERTIES] = { NULL, };
static gint
compute_cumsum_local_width (UfoNonLocalMeansTaskPrivate *priv)
{
gdouble integer;
gint local_width;
/* Compute global and local dimensions for the cumsum kernel */
/* First make sure local_width is a power of 2 */
local_width = (gint) ufo_math_compute_closest_smaller_power_of_2 (priv->max_work_group_size);
if (local_width > 4) {
/* Empirically determined value on NVIDIA cards */
local_width /= 4;
}
/* *local_width* is the minimum of the power-of-two-padded minimum of
* (width, height) and the desired local width.
*/
modf (log2 (MIN (priv->padded_size[0], priv->padded_size[1]) - 1.0f), &integer);
local_width = (gsize) MIN (local_width, pow (2, 1 + ((gint) integer)));
return local_width;
}
static void
release_gaussian_window (UfoNonLocalMeansTaskPrivate *priv)
{
if (priv->window_mem) {
UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->window_mem));
priv->window_mem = NULL;
}
}
static void
create_gaussian_window (UfoNonLocalMeansTaskPrivate *priv)
{
cl_int err;
gfloat *coefficients, coefficients_sum = 0.0f, sigma;
gsize wsize;
gint r;
wsize = 2 * priv->patch_radius + 1;
r = (gint) priv->patch_radius;
coefficients = g_malloc0 (sizeof (gfloat) * wsize * wsize);
sigma = priv->patch_radius / 2.0f;
for (gint y = 0; y < (gint) wsize; y++) {
for (gint x = 0; x < (gint) wsize; x++) {
coefficients[y * wsize + x] = exp (- ((x - r) * (x - r) + (y - r) * (y - r)) / (sigma * sigma * 2.0f));
coefficients_sum += coefficients[y * wsize + x];
}
}
for (guint i = 0; i < wsize * wsize; i++) {
coefficients[i] /= coefficients_sum;
}
priv->window_mem = clCreateBuffer (priv->context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof (cl_float) * wsize * wsize,
coefficients,
&err);
UFO_RESOURCES_CHECK_CLERR (err);
g_free (coefficients);
}
static void
release_fast_buffers (UfoNonLocalMeansTaskPrivate *priv)
{
if (priv->group_sums) {
UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->group_sums));
priv->group_sums = NULL;
}
if (priv->weights_mem) {
UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->weights_mem));
priv->weights_mem = NULL;
}
for (gint i = 0; i < 2; i++) {
if (priv->aux_mem[i]) {
UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->aux_mem[i]));
priv->aux_mem[i] = NULL;
}
}
}
static void
create_fast_buffers (UfoNonLocalMeansTaskPrivate *priv)
{
cl_int err;
gint local_width = compute_cumsum_local_width (priv);
priv->group_sums = clCreateBuffer (priv->context,
CL_MEM_READ_WRITE,
sizeof (cl_float) * local_width * MAX (priv->padded_size[0], priv->padded_size[1]),
NULL,
&err);
UFO_RESOURCES_CHECK_CLERR (err);
priv->weights_mem = clCreateBuffer (priv->context,
CL_MEM_READ_WRITE,
sizeof (cl_float) * priv->cropped_size[0] * priv->cropped_size[1],
NULL,
&err);
for (gint i = 0; i < 2; i++) {
priv->aux_mem[i] = clCreateBuffer (priv->context,
CL_MEM_READ_WRITE,
sizeof (cl_float) * priv->padded_size[0] * priv->padded_size[1],
NULL,
&err);
UFO_RESOURCES_CHECK_CLERR (err);
}
UFO_RESOURCES_CHECK_CLERR (err);
}
static void
transpose (UfoNonLocalMeansTaskPrivate *priv,
cl_command_queue cmd_queue,
UfoProfiler *profiler,
cl_mem in_mem,
cl_mem out_mem,
const gint width,
const gint height)
{
gsize global_size[2];
gsize local_size[2] = {32, 32 / PIXELS_PER_THREAD};
static gint iteration_number = 0;
while (local_size[0] * local_size[1] > priv->max_work_group_size) {
local_size[0] /= 2;
local_size[1] /= 2;
}
global_size[0] = ((width - 1) / local_size[0] + 1) * local_size[0];
global_size[1] = ((height - 1) / local_size[1] / PIXELS_PER_THREAD + 1) * local_size[1];
if (!iteration_number) {
g_debug ("Image size: %d x %d", width, height);
g_debug ("Transpose global work group size: %lu x %lu", global_size[0], global_size[1]);
g_debug ("Transpose local work group size: %lu x %lu", local_size[0], local_size[1]);
iteration_number++;
}
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 0, sizeof (cl_mem), &in_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 1, sizeof (cl_mem), &out_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 2,
(local_size[0] + 1) * local_size[1] * PIXELS_PER_THREAD * sizeof (cl_float), NULL));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 3, sizeof (cl_int), &width));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 4, sizeof (cl_int), &height));
ufo_profiler_call (profiler, cmd_queue, priv->transpose_kernel, 2, global_size, local_size);
}
static void
compute_cumsum (UfoNonLocalMeansTaskPrivate *priv,
cl_command_queue cmd_queue,
UfoProfiler *profiler,
cl_mem in_mem,
cl_mem out_mem,
const gint width,
const gint height)
{
gint local_width, num_groups, num_group_iterations, one = 1;
gsize cumsum_global_size[2], block_sums_global_size[2], spread_global_size[2],
local_size[2], spread_local_size[2];
gsize cache_size;
static gint iteration_number = 0;
local_width = compute_cumsum_local_width (priv);
/* Number of groups we need to process *width* pixels. If it is more than
* *local_width* then use only that number and every group will process more
* successive blocks given by group size. This is necessary in order to
* avoid recursion in the spreading phase of the group sums. The local cache
* memory is limited to *local_width*, every group stores its sum into an
* auxiliary buffer, which is then also summed (only one iteration needed,
* because there is only one group in the auxiliary buffer since we limit
* the number of groups to *local_width*.
* This is not be the final number of groups, it's just used to compute the
* number of iterations of every group.
*/
num_groups = MIN (local_width, UFO_MATH_NUM_CHUNKS (width, local_width));
/* Number of iterations of every group is given by the number of pixels
* divided by the number of pixels *num_groups* can process. */
num_group_iterations = UFO_MATH_NUM_CHUNKS (width, local_width * num_groups);
/* Finally, the real number of groups is given by the number of pixels
* divided by the group size and the number of group iterations. */
num_groups = UFO_MATH_NUM_CHUNKS (width, num_group_iterations * local_width);
/* Cache size must be larger by *local_size* / 16 because of the bank
* conflicts avoidance. Additionally, +1 is needed because of the shifted
* access to the local memory.
*/
cache_size = sizeof (cl_float) * (local_width + UFO_MATH_NUM_CHUNKS (local_width, 16) + 1);
cumsum_global_size[0] = num_groups * local_width / 2;
cumsum_global_size[1] = height;
block_sums_global_size[0] = local_width / 2;
block_sums_global_size[1] = height;
spread_global_size[0] = (num_groups - 1) * local_width;
spread_global_size[1] = height;
local_size[0] = local_width / 2;
local_size[1] = 1;
spread_local_size[0] = local_width;
spread_local_size[1] = 1;
if (!iteration_number) {
g_debug (" width: %d", width);
g_debug (" local width: %d", local_width);
g_debug (" num groups: %d", num_groups);
g_debug ("group iterations: %d", num_group_iterations);
g_debug (" kernel dims: %lu %lu %lu %lu", cumsum_global_size[0], cumsum_global_size[1], local_size[0], local_size[1]);
g_debug (" cache size: %lu", cache_size);
iteration_number++;
}
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 0, sizeof (cl_mem), &in_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 1, sizeof (cl_mem), &out_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 2, sizeof (cl_mem), &priv->group_sums));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 3, cache_size, NULL));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 4, sizeof (gint), &num_group_iterations));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 5, sizeof (gint), &width));
ufo_profiler_call (profiler, cmd_queue, priv->cumsum_kernel, 2, cumsum_global_size, local_size);
if (num_groups > 1) {
/* If there are more than one group, we need to spread the partial sums
* of the groups to successive groups. First compute the sums of the
* groups and then spread them to respective pixels in them. Because of
* our choice of number of iterations per group above, the partial sums
* have to be summed only once without recursion.
*/
/* First sum the partial sums. */
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 0, sizeof (cl_mem), &priv->group_sums));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 1, sizeof (cl_mem), &priv->group_sums));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 2, sizeof (cl_mem), NULL));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 3, cache_size, NULL));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 4, sizeof (gint), &one));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 5, sizeof (gint), &local_width));
ufo_profiler_call (profiler, cmd_queue, priv->cumsum_kernel, 2, block_sums_global_size, local_size);
/* Spread them across all pixels. */
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 0, sizeof (cl_mem), &out_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 1, sizeof (cl_mem), &priv->group_sums));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 2, sizeof (gint), &num_group_iterations));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 3, sizeof (gint), &width));
ufo_profiler_call (profiler, cmd_queue, priv->spread_kernel, 2, spread_global_size, spread_local_size);
}
}
static void
compute_sdx (UfoNonLocalMeansTaskPrivate *priv,
cl_command_queue cmd_queue,
UfoProfiler *profiler,
cl_mem in_mem,
cl_mem out_mem,
const gint dx,
const gint dy)
{
gfloat var = priv->sigma * priv->sigma;
/* First compute the shifted MSE */
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 0, sizeof (cl_mem), &in_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 1, sizeof (cl_mem), &priv->aux_mem[0]));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 2, sizeof (cl_sampler), &priv->sampler));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 3, sizeof (gint), &dx));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 4, sizeof (gint), &dy));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 5, sizeof (gint), &priv->padding));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 6, sizeof (gint), &var));
ufo_profiler_call (profiler, cmd_queue, priv->mse_kernel, 2, priv->padded_size, NULL);
/* Horizontal cumsum and transposition */
compute_cumsum (priv, cmd_queue, profiler, priv->aux_mem[0], priv->aux_mem[1], priv->padded_size[0], priv->padded_size[1]);
transpose (priv, cmd_queue, profiler, priv->aux_mem[1], priv->aux_mem[0],
priv->padded_size[0], priv->padded_size[1]);
/* 2D cumsum is separable, so compute the horizontal cumsum of the transposed horizontally cumsummed image */
compute_cumsum (priv, cmd_queue, profiler, priv->aux_mem[0], priv->aux_mem[1], priv->padded_size[1], priv->padded_size[0]);
transpose (priv, cmd_queue, profiler, priv->aux_mem[1], priv->aux_mem[0],
priv->padded_size[1], priv->padded_size[0]);
}
static void
process_shift (UfoNonLocalMeansTaskPrivate *priv,
cl_command_queue cmd_queue,
UfoProfiler *profiler,
cl_mem input_image,
cl_mem integral_mem,
cl_mem out_mem,
const gint dx,
const gint dy)
{
gint patch_size = 2 * priv->patch_radius + 1;
gfloat coeff = 1.0f / (priv->h * priv->h * patch_size * patch_size);
gint is_conjugate = 0;
/* Compute r(x) = w(x) * f(x + dx) */
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 0, sizeof (cl_mem), &input_image));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 1, sizeof (cl_mem), &integral_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 2, sizeof (cl_mem), &priv->weights_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 3, sizeof (cl_mem), &out_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 4, sizeof (cl_sampler), &priv->sampler));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 5, sizeof (gint), &dx));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 6, sizeof (gint), &dy));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 7, sizeof (gint), &priv->patch_radius));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 8, sizeof (gint), &priv->padding));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 9, sizeof (gfloat), &coeff));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 10, sizeof (gint), &is_conjugate));
ufo_profiler_call (profiler, cmd_queue, priv->weight_kernel, 2, priv->cropped_size, NULL);
if (dx != 0 || dy != 0) {
/* Compute r(x + dx) = w(x + dx) * f(x), which cannot be computed at
* once in the above kernel call because one work item would write to
* two global memory locations, thus creating a race condition. */
is_conjugate = 1;
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 0, sizeof (cl_mem), &input_image));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 1, sizeof (cl_mem), &integral_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 2, sizeof (cl_mem), &priv->weights_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 3, sizeof (cl_mem), &out_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 4, sizeof (cl_sampler), &priv->sampler));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 5, sizeof (gint), &dx));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 6, sizeof (gint), &dy));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 7, sizeof (gint), &priv->patch_radius));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 8, sizeof (gint), &priv->padding));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 9, sizeof (gfloat), &coeff));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 10, sizeof (gint), &is_conjugate));
ufo_profiler_call (profiler, cmd_queue, priv->weight_kernel, 2, priv->cropped_size, NULL);
}
}
static void
compute_nlm_fast (UfoNonLocalMeansTaskPrivate *priv,
cl_command_queue cmd_queue,
UfoProfiler *profiler,
cl_mem in_mem,
cl_mem out_mem)
{
gint dx, dy, sr;
sr = (gint) priv->search_radius;
/* Every pixel computes its own result and spreads it to the pixel y + dy,
* thus we start at dy = 0 every pixel gets its dy < 0 value from the pixel
* y - dy. For dy = 0, compute only pixels to the right, so the negative
* values are obtained from the pixels to the left (analogous to the dy
* case).
*/
for (dy = 0; dy < sr + 1; dy++) {
for (dx = dy == 0 ? 0 : -sr; dx < sr + 1; dx++) {
compute_sdx (priv, cmd_queue, profiler, in_mem, out_mem, dx, dy);
process_shift (priv, cmd_queue, profiler, in_mem, priv->aux_mem[0], out_mem, dx, dy);
}
}
/* Now we have the sum of results and weights, divide them to get the
* result.
*/
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->divide_kernel, 0, sizeof (cl_mem), &priv->weights_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->divide_kernel, 1, sizeof (cl_mem), &out_mem));
ufo_profiler_call (profiler, cmd_queue, priv->divide_kernel, 2, priv->cropped_size, NULL);
}
UfoNode *
ufo_non_local_means_task_new (void)
{
return UFO_NODE (g_object_new (UFO_TYPE_NON_LOCAL_MEANS_TASK, NULL));
}
static void
ufo_non_local_means_task_setup (UfoTask *task,
UfoResources *resources,
GError **error)
{
UfoNonLocalMeansTaskPrivate *priv;
cl_int err;
priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
if (priv->fast) {
priv->mse_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "compute_shifted_mse", NULL, error);
priv->cumsum_kernel = ufo_resources_get_kernel (resources, "cumsum.cl", "cumsum", NULL, error);
priv->spread_kernel = ufo_resources_get_kernel (resources, "cumsum.cl", "spread_block_sums", NULL, error);
priv->transpose_kernel = ufo_resources_get_kernel (resources, "transpose.cl", "transpose_shared", NULL, error);
priv->weight_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "process_shift", NULL, error);
priv->divide_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "divide_inplace", NULL, error);
} else {
priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error);
}
priv->convolution_kernel = ufo_resources_get_kernel (resources, "estimate-noise.cl", "convolve_abs_laplacian_diff", NULL, error);
priv->sum_kernel = ufo_resources_get_kernel (resources, "reductor.cl", "reduce_M_SUM", NULL, error);
priv->window_mem = NULL;
priv->group_sums = NULL;
priv->weights_mem = NULL;
for (gint i = 0; i < 2; i++) {
priv->aux_mem[i] = NULL;
}
if (priv->kernel) {
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->kernel), error);
}
if (priv->mse_kernel) {
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->mse_kernel), error);
}
if (priv->cumsum_kernel) {
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->cumsum_kernel), error);
}
if (priv->spread_kernel) {
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->spread_kernel), error);
}
if (priv->transpose_kernel) {
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->transpose_kernel), error);
}
if (priv->weight_kernel) {
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->weight_kernel), error);
}
if (priv->divide_kernel) {
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->divide_kernel), error);
}
if (priv->convolution_kernel) {
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->convolution_kernel), error);
}
if (priv->sum_kernel) {
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->sum_kernel), error);
}
priv->context = ufo_resources_get_context (resources);
UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainContext (priv->context), error);
priv->sampler = clCreateSampler (priv->context,
(cl_bool) TRUE,
priv->addressing_mode,
CL_FILTER_NEAREST,
&err);
UFO_RESOURCES_CHECK_CLERR (err);
}
static void
ufo_non_local_means_task_get_requisition (UfoTask *task,
UfoBuffer **inputs,
UfoRequisition *requisition,
GError **error)
{
UfoNonLocalMeansTaskPrivate *priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
UfoGpuNode *node;
GValue *max_work_group_size_gvalue;
ufo_buffer_get_requisition (inputs[0], requisition);
if (priv->max_work_group_size == 0) {
node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task)));
max_work_group_size_gvalue = ufo_gpu_node_get_info (node, UFO_GPU_NODE_INFO_MAX_WORK_GROUP_SIZE);
priv->max_work_group_size = g_value_get_ulong (max_work_group_size_gvalue);
g_value_unset (max_work_group_size_gvalue);
}
if (!priv->fast && priv->use_window && !priv->window_mem) {
create_gaussian_window (priv);
}
if (priv->cropped_size[0] != requisition->dims[0] || priv->cropped_size[1] != requisition->dims[1]) {
priv->cropped_size[0] = requisition->dims[0];
priv->cropped_size[1] = requisition->dims[1];
if (priv->fast) {
priv->padding = priv->search_radius + priv->patch_radius + 1;
priv->padded_size[0] = priv->cropped_size[0] + 2 * priv->padding;
priv->padded_size[1] = priv->cropped_size[1] + 2 * priv->padding;
if (priv->group_sums) {
/* Buffers exist, which means size changed, release first. */
release_fast_buffers (priv);
}
create_fast_buffers (priv);
}
}
}
static guint
ufo_non_local_means_task_get_num_inputs (UfoTask *task)
{
return 1;
}
static guint
ufo_non_local_means_task_get_num_dimensions (UfoTask *task,
guint input)
{
return 2;
}
static UfoTaskMode
ufo_non_local_means_task_get_mode (UfoTask *task)
{
return UFO_TASK_MODE_PROCESSOR | UFO_TASK_MODE_GPU;
}
static gboolean
ufo_non_local_means_task_process (UfoTask *task,
UfoBuffer **inputs,
UfoBuffer *output,
UfoRequisition *requisition)
{
UfoNonLocalMeansTaskPrivate *priv;
UfoGpuNode *node;
UfoProfiler *profiler;
UfoRequisition in_req;
cl_command_queue cmd_queue;
cl_mem in_mem;
cl_mem out_mem;
gfloat h, var, estimated_sigma;
gfloat fill_pattern = 0.0f;
guint num_processed;
priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task)));
cmd_queue = ufo_gpu_node_get_cmd_queue (node);
in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue);
out_mem = ufo_buffer_get_device_array (output, cmd_queue);
profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task));
ufo_buffer_get_requisition (inputs[0], &in_req);
g_object_get (task, "num_processed", &num_processed, NULL);
/* Estimate sigma only from the first image in the stream in order to
* keep the results consistent.
*/
if (num_processed == 0 && (priv->h <= 0.0f || priv->estimate_sigma)) {
/* Use out_mem for the convolution, it's not necessary after the
* computation and can be re-used by the de-noising itself.
*/
estimated_sigma = ufo_common_estimate_sigma (priv->convolution_kernel,
priv->sum_kernel,
cmd_queue,
priv->sampler,
profiler,
in_mem,
out_mem,
priv->max_work_group_size,
priv->cropped_size);
g_debug ("Estimated sigma: %g", estimated_sigma);
if (priv->h <= 0.0f) {
priv->h = estimated_sigma;
}
if (priv->estimate_sigma) {
priv->sigma = estimated_sigma;
}
}
h = 1 / priv->h / priv->h;
var = priv->sigma * priv->sigma;
if (priv->fast) {
UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, out_mem, &fill_pattern, sizeof (gfloat), 0,
sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1],
0, NULL, NULL));
UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, priv->weights_mem, &fill_pattern, sizeof (gfloat), 0,
sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1],
0, NULL, NULL));
compute_nlm_fast (priv, cmd_queue, profiler, in_mem, out_mem);
} else {
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &h));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 6, sizeof (gfloat), &var));
UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 7, sizeof (cl_mem), &priv->window_mem));
ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL);
}
return TRUE;
}
static void
ufo_non_local_means_task_set_property (GObject *object,
guint property_id,
const GValue *value,
GParamSpec *pspec)
{
UfoNonLocalMeansTaskPrivate *priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (object);
switch (property_id) {
case PROP_SEARCH_RADIUS:
priv->search_radius = g_value_get_uint (value);
break;
case PROP_PATCH_RADIUS:
priv->patch_radius = g_value_get_uint (value);
break;
case PROP_H:
priv->h = g_value_get_float (value);
break;
case PROP_SIGMA:
priv->sigma = g_value_get_float (value);
break;
case PROP_WINDOW:
priv->use_window = g_value_get_boolean (value);
break;
case PROP_FAST:
priv->fast = g_value_get_boolean (value);
break;
case PROP_ESTIMATE_SIGMA:
priv->estimate_sigma = g_value_get_boolean (value);
break;
case PROP_ADDRESSING_MODE:
priv->addressing_mode = g_value_get_enum (value);
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec);
break;
}
}
static void
ufo_non_local_means_task_get_property (GObject *object,
guint property_id,
GValue *value,
GParamSpec *pspec)
{
UfoNonLocalMeansTaskPrivate *priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (object);
switch (property_id) {
case PROP_SEARCH_RADIUS:
g_value_set_uint (value, priv->search_radius);
break;
case PROP_PATCH_RADIUS:
g_value_set_uint (value, priv->patch_radius);
break;
case PROP_H:
g_value_set_float (value, priv->h);
break;
case PROP_SIGMA:
g_value_set_float (value, priv->sigma);
break;
case PROP_WINDOW:
g_value_set_boolean (value, priv->use_window);
break;
case PROP_FAST:
g_value_set_boolean (value, priv->fast);
break;
case PROP_ESTIMATE_SIGMA:
g_value_set_boolean (value, priv->estimate_sigma);
break;
case PROP_ADDRESSING_MODE:
g_value_set_enum (value, priv->addressing_mode);
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec);
break;
}
}
static void
ufo_non_local_means_task_finalize (GObject *object)
{
UfoNonLocalMeansTaskPrivate *priv;
priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (object);
if (priv->kernel) {
UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->kernel));
priv->kernel = NULL;
}
if (priv->mse_kernel) {
UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->mse_kernel));
priv->mse_kernel = NULL;
}
if (priv->cumsum_kernel) {
UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->cumsum_kernel));
priv->cumsum_kernel = NULL;
}
if (priv->spread_kernel) {
UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->spread_kernel));
priv->spread_kernel = NULL;
}
if (priv->transpose_kernel) {
UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->transpose_kernel));
priv->transpose_kernel = NULL;
}
if (priv->weight_kernel) {
UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->weight_kernel));
priv->weight_kernel = NULL;
}
if (priv->divide_kernel) {
UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->divide_kernel));
priv->divide_kernel = NULL;
}
if (priv->convolution_kernel) {
UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->convolution_kernel));
priv->convolution_kernel = NULL;
}
if (priv->sum_kernel) {
UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->sum_kernel));
priv->sum_kernel = NULL;
}
if (priv->sampler) {
UFO_RESOURCES_CHECK_CLERR (clReleaseSampler (priv->sampler));
priv->sampler = NULL;
}
if (priv->context) {
UFO_RESOURCES_CHECK_CLERR (clReleaseContext (priv->context));
priv->context = NULL;
}
release_gaussian_window (priv);
release_fast_buffers (priv);
G_OBJECT_CLASS (ufo_non_local_means_task_parent_class)->finalize (object);
}
static void
ufo_task_interface_init (UfoTaskIface *iface)
{
iface->setup = ufo_non_local_means_task_setup;
iface->get_num_inputs = ufo_non_local_means_task_get_num_inputs;
iface->get_num_dimensions = ufo_non_local_means_task_get_num_dimensions;
iface->get_mode = ufo_non_local_means_task_get_mode;
iface->get_requisition = ufo_non_local_means_task_get_requisition;
iface->process = ufo_non_local_means_task_process;
}
static void
ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass)
{
GObjectClass *oclass = G_OBJECT_CLASS (klass);
oclass->set_property = ufo_non_local_means_task_set_property;
oclass->get_property = ufo_non_local_means_task_get_property;
oclass->finalize = ufo_non_local_means_task_finalize;
properties[PROP_SEARCH_RADIUS] =
g_param_spec_uint ("search-radius",
"Search radius in pixels",
"Search radius in pixels",
1, 8192, 10,
G_PARAM_READWRITE);
properties[PROP_PATCH_RADIUS] =
g_param_spec_uint ("patch-radius",
"Patch radius in pixels",
"Patch radius in pixels",
1, 100, 3,
G_PARAM_READWRITE);
properties[PROP_H] =
g_param_spec_float ("h",
"Smoothing control parameter, should be around noise standard deviation or slightly less",
"Smoothing control parameter, should be around noise standard deviation or slightly less",
0.0f, G_MAXFLOAT, 0.1f,
G_PARAM_READWRITE);
properties[PROP_SIGMA] =
g_param_spec_float ("sigma",
"Noise standard deviation",
"Noise standard deviation",
0.0f, G_MAXFLOAT, 0.0f,
G_PARAM_READWRITE);
properties[PROP_WINDOW] =
g_param_spec_boolean ("window",
"Use Gaussian window by computing patch weights",
"Use Gaussian window by computing patch weights",
TRUE,
G_PARAM_READWRITE);
properties[PROP_FAST] =
g_param_spec_boolean ("fast",
"Use a faster version of the algorithm",
"Use a faster version of the algorithm",
TRUE,
G_PARAM_READWRITE);
properties[PROP_ESTIMATE_SIGMA] =
g_param_spec_boolean ("estimate-sigma",
"Estimate noise standard deviation",
"Estimate noise standard deviation",
FALSE,
G_PARAM_READWRITE);
properties[PROP_ADDRESSING_MODE] =
g_param_spec_enum ("addressing-mode",
"Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")",
"Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")",
g_enum_register_static ("ufo_nlm_addressing_mode", addressing_values),
CL_ADDRESS_MIRRORED_REPEAT,
G_PARAM_READWRITE);
for (guint i = PROP_0 + 1; i < N_PROPERTIES; i++)
g_object_class_install_property (oclass, i, properties[i]);
g_type_class_add_private (oclass, sizeof(UfoNonLocalMeansTaskPrivate));
}
static void
ufo_non_local_means_task_init(UfoNonLocalMeansTask *self)
{
self->priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE(self);
self->priv->search_radius = 10;
self->priv->patch_radius = 3;
self->priv->cropped_size[0] = 0;
self->priv->cropped_size[1] = 0;
self->priv->padded_size[0] = 0;
self->priv->padded_size[1] = 0;
self->priv->padding = -1;
self->priv->h = 0.0f;
self->priv->sigma = 0.0f;
self->priv->max_work_group_size = 0;
self->priv->use_window = TRUE;
self->priv->addressing_mode = CL_ADDRESS_MIRRORED_REPEAT;
self->priv->fast = TRUE;
self->priv->estimate_sigma = FALSE;
}