diff options
Diffstat (limited to 'src/ufo-non-local-means-task.c')
-rw-r--r-- | src/ufo-non-local-means-task.c | 549 |
1 files changed, 533 insertions, 16 deletions
diff --git a/src/ufo-non-local-means-task.c b/src/ufo-non-local-means-task.c
index ee9f2ad..4c14c67 100644
--- a/src/ufo-non-local-means-task.c
+++ b/src/ufo-non-local-means-task.c
@@ -27,17 +27,22 @@
 #include "ufo-non-local-means-task.h"
 #include "common/ufo-addressing.h"
 
+#define PIXELS_PER_THREAD 4
+#define NUM_CHUNKS(n, k) (((n) - 1) / (k) + 1)
 
 struct _UfoNonLocalMeansTaskPrivate {
     guint search_radius;
     guint patch_radius;
+    gsize max_work_group_size, cropped_size[2], padded_size[2];
+    gint padding;
     gfloat h;
     gfloat sigma;
     gboolean use_window;
-    cl_kernel kernel;
+    gboolean fast;
+    cl_kernel kernel, mse_kernel, cumsum_kernel, spread_kernel, transpose_kernel, weight_kernel, divide_kernel;
     cl_sampler sampler;
     cl_context context;
-    cl_mem window_mem;
+    cl_mem window_mem, group_sums, aux_mem[2], weights_mem;
     AddressingMode addressing_mode;
 };
 
@@ -56,12 +61,45 @@ enum {
     PROP_H,
     PROP_SIGMA,
     PROP_WINDOW,
+    PROP_FAST,
     PROP_ADDRESSING_MODE,
     N_PROPERTIES
 };
 
 static GParamSpec *properties[N_PROPERTIES] = { NULL, };
 
+/**
+ * compute_cumsum_local_width:
+ * @priv: UfoNonLocalMeansTaskPrivate
+ *
+ * Pick the power-of-two work group width used by the cumsum kernel, based on
+ * the device's maximum work group size and the padded image dimensions.
+ *
+ * Returns: the local width (a power of two)
+ */
+static gint
+compute_cumsum_local_width (UfoNonLocalMeansTaskPrivate *priv)
+{
+    gdouble integer;
+    gint local_width;
+
+    /* Compute global and local dimensions for the cumsum kernel */
+    /* First make sure local_width is a power of 2 */
+    modf (log2 (priv->max_work_group_size), &integer);
+    local_width = (gint) pow (2, integer);
+    if (local_width > 4) {
+        /* Empirically determined value on NVIDIA cards */
+        local_width /= 4;
+    }
+    /* *local_width* is the minimum of the power-of-two-padded minimum of
+     * (width, height) and the desired local width.
+     */
+    modf (log2 (MIN (priv->padded_size[0], priv->padded_size[1]) - 1.0f), &integer);
+    local_width = (gint) MIN (local_width, pow (2, 1 + ((gint) integer)));
+
+    return local_width;
+}
+
 /**
  * release_coefficients:
  * @priv: UfoNonLocalMeansTaskPrivate
@@ -116,6 +154,371 @@ create_coefficients (UfoNonLocalMeansTaskPrivate *priv)
     g_free (coefficients);
 }
 
+/**
+ * release_fast_buffers:
+ * @priv: UfoNonLocalMeansTaskPrivate
+ *
+ * Release the OpenCL buffers used by the fast algorithm and reset the
+ * pointers to NULL, so that calling this function repeatedly is safe.
+ */
+static void
+release_fast_buffers (UfoNonLocalMeansTaskPrivate *priv)
+{
+    if (priv->group_sums) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->group_sums));
+        priv->group_sums = NULL;
+    }
+    if (priv->weights_mem) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->weights_mem));
+        priv->weights_mem = NULL;
+    }
+    for (gint i = 0; i < 2; i++) {
+        if (priv->aux_mem[i]) {
+            UFO_RESOURCES_CHECK_CLERR (clReleaseMemObject (priv->aux_mem[i]));
+            priv->aux_mem[i] = NULL;
+        }
+    }
+}
+
+/**
+ * create_fast_buffers:
+ * @priv: UfoNonLocalMeansTaskPrivate
+ *
+ * Allocate the OpenCL buffers used by the fast algorithm: the cumsum group
+ * sums, the accumulated weights (cropped size) and two auxiliary buffers
+ * (padded size). The cropped and padded sizes in @priv must be set.
+ */
+static void
+create_fast_buffers (UfoNonLocalMeansTaskPrivate *priv)
+{
+    cl_int err;
+    gint local_width = compute_cumsum_local_width (priv);
+
+    priv->group_sums = clCreateBuffer (priv->context,
+                                       CL_MEM_READ_WRITE,
+                                       sizeof (cl_float) * local_width * MAX (priv->padded_size[0], priv->padded_size[1]),
+                                       NULL,
+                                       &err);
+    UFO_RESOURCES_CHECK_CLERR (err);
+
+    priv->weights_mem = clCreateBuffer (priv->context,
+                                        CL_MEM_READ_WRITE,
+                                        sizeof (cl_float) * priv->cropped_size[0] * priv->cropped_size[1],
+                                        NULL,
+                                        &err);
+    UFO_RESOURCES_CHECK_CLERR (err);
+    for (gint i = 0; i < 2; i++) {
+        priv->aux_mem[i] = clCreateBuffer (priv->context,
+                                           CL_MEM_READ_WRITE,
+                                           sizeof (cl_float) * priv->padded_size[0] * priv->padded_size[1],
+                                           NULL,
+                                           &err);
+        UFO_RESOURCES_CHECK_CLERR (err);
+    }
+}
+
+/**
+ * transpose:
+ * @priv: UfoNonLocalMeansTaskPrivate
+ * @cmd_queue: command queue the kernel is enqueued on
+ * @profiler: profiler used to launch the kernel
+ * @in_mem: input buffer
+ * @out_mem: output buffer
+ * @width: input width passed to the kernel
+ * @height: input height passed to the kernel
+ *
+ * Launch the transpose kernel on @in_mem and write the result to @out_mem.
+ */
+static void
+transpose (UfoNonLocalMeansTaskPrivate *priv,
+           cl_command_queue cmd_queue,
+           UfoProfiler *profiler,
+           cl_mem in_mem,
+           cl_mem out_mem,
+           const gint width,
+           const gint height)
+{
+    gsize global_size[2];
+    gsize local_size[2] = {32, 32 / PIXELS_PER_THREAD};
+    static gint iteration_number = 0;
+
+    while (local_size[0] * local_size[1] > priv->max_work_group_size) {
+        local_size[0] /= 2;
+        local_size[1] /= 2;
+    }
+    global_size[0] = ((width - 1) / local_size[0] + 1) * local_size[0];
+    global_size[1] = ((height - 1) / local_size[1] / PIXELS_PER_THREAD + 1) * local_size[1];
+    if (!iteration_number) {
+        g_debug ("Image size: %d x %d", width, height);
+        g_debug ("Transpose global work group size: %" G_GSIZE_FORMAT " x %" G_GSIZE_FORMAT, global_size[0], global_size[1]);
+        g_debug ("Transpose local work group size: %" G_GSIZE_FORMAT " x %" G_GSIZE_FORMAT, local_size[0], local_size[1]);
+        iteration_number++;
+    }
+
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 0, sizeof (cl_mem), &in_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 1, sizeof (cl_mem), &out_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 2,
+                                               (local_size[0] + 1) * local_size[1] * PIXELS_PER_THREAD * sizeof (cl_float), NULL));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 3, sizeof (cl_int), &width));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->transpose_kernel, 4, sizeof (cl_int), &height));
+    ufo_profiler_call (profiler, cmd_queue, priv->transpose_kernel, 2, global_size, local_size);
+}
+
+/**
+ * compute_cumsum:
+ * @priv: UfoNonLocalMeansTaskPrivate
+ * @cmd_queue: command queue the kernels are enqueued on
+ * @profiler: profiler used to launch the kernels
+ * @in_mem: input buffer
+ * @out_mem: output buffer
+ * @width: number of pixels in one row
+ * @height: number of rows
+ *
+ * Compute the cumulative sum of every row of @in_mem into @out_mem. Rows are
+ * scanned by work groups; if more than one group is needed, the per-group
+ * sums are scanned as well and spread over the following groups.
+ */
+static void
+compute_cumsum (UfoNonLocalMeansTaskPrivate *priv,
+                cl_command_queue cmd_queue,
+                UfoProfiler *profiler,
+                cl_mem in_mem,
+                cl_mem out_mem,
+                const gint width,
+                const gint height)
+{
+    gint local_width, num_groups, num_group_iterations, one = 1;
+    gsize cumsum_global_size[2], block_sums_global_size[2], spread_global_size[2],
+          local_size[2], spread_local_size[2];
+    gsize cache_size;
+    static gint iteration_number = 0;
+
+    local_width = compute_cumsum_local_width (priv);
+    /* Number of groups we need to process *width* pixels. If it is more than
+     * *local_width* then use only that number and every group will process more
+     * successive blocks given by group size. This is necessary in order to
+     * avoid recursion in the spreading phase of the group sums. The local cache
+     * memory is limited to *local_width*, every group stores its sum into an
+     * auxiliary buffer, which is then also summed (only one iteration needed,
+     * because there is only one group in the auxiliary buffer since we limit
+     * the number of groups to *local_width*).
+     * This is not the final number of groups, it's just used to compute the
+     * number of iterations of every group.
+     */
+    num_groups = MIN (local_width, NUM_CHUNKS (width, local_width));
+    /* Number of iterations of every group is given by the number of pixels
+     * divided by the number of pixels *num_groups* can process. */
+    num_group_iterations = NUM_CHUNKS (width, local_width * num_groups);
+    /* Finally, the real number of groups is given by the number of pixels
+     * divided by the group size and the number of group iterations. */
+    num_groups = NUM_CHUNKS (width, num_group_iterations * local_width);
+
+    /* Cache size must be larger by *local_width* / 16 because of the bank
+     * conflicts avoidance. Additionally, +1 is needed because of the shifted
+     * access to the local memory.
+     */
+    cache_size = sizeof (cl_float) * (local_width + NUM_CHUNKS (local_width, 16) + 1);
+    cumsum_global_size[0] = num_groups * local_width / 2;
+    cumsum_global_size[1] = height;
+    block_sums_global_size[0] = local_width / 2;
+    block_sums_global_size[1] = height;
+    spread_global_size[0] = (num_groups - 1) * local_width;
+    spread_global_size[1] = height;
+    local_size[0] = local_width / 2;
+    local_size[1] = 1;
+    spread_local_size[0] = local_width;
+    spread_local_size[1] = 1;
+    if (!iteration_number) {
+        g_debug (" width: %d", width);
+        g_debug (" local width: %d", local_width);
+        g_debug (" num groups: %d", num_groups);
+        g_debug ("group iterations: %d", num_group_iterations);
+        g_debug (" kernel dims: %" G_GSIZE_FORMAT " %" G_GSIZE_FORMAT " %" G_GSIZE_FORMAT " %" G_GSIZE_FORMAT, cumsum_global_size[0], cumsum_global_size[1], local_size[0], local_size[1]);
+        g_debug (" cache size: %" G_GSIZE_FORMAT, cache_size);
+        iteration_number++;
+    }
+
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 0, sizeof (cl_mem), &in_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 1, sizeof (cl_mem), &out_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 2, sizeof (cl_mem), &priv->group_sums));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 3, cache_size, NULL));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 4, sizeof (gint), &num_group_iterations));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 5, sizeof (gint), &width));
+    ufo_profiler_call (profiler, cmd_queue, priv->cumsum_kernel, 2, cumsum_global_size, local_size);
+
+    if (num_groups > 1) {
+        /* If there is more than one group, we need to spread the partial sums
+         * of the groups to successive groups. First compute the sums of the
+         * groups and then spread them to respective pixels in them. Because of
+         * our choice of number of iterations per group above, the partial sums
+         * have to be summed only once without recursion.
+         */
+        /* First sum the partial sums. */
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 0, sizeof (cl_mem), &priv->group_sums));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 1, sizeof (cl_mem), &priv->group_sums));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 2, sizeof (cl_mem), NULL));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 3, cache_size, NULL));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 4, sizeof (gint), &one));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->cumsum_kernel, 5, sizeof (gint), &local_width));
+        ufo_profiler_call (profiler, cmd_queue, priv->cumsum_kernel, 2, block_sums_global_size, local_size);
+
+        /* Spread them across all pixels. */
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 0, sizeof (cl_mem), &out_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 1, sizeof (cl_mem), &priv->group_sums));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 2, sizeof (gint), &num_group_iterations));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->spread_kernel, 3, sizeof (gint), &width));
+        ufo_profiler_call (profiler, cmd_queue, priv->spread_kernel, 2, spread_global_size, spread_local_size);
+    }
+}
+
+/**
+ * compute_sdx:
+ * @priv: UfoNonLocalMeansTaskPrivate
+ * @cmd_queue: command queue the kernels are enqueued on
+ * @profiler: profiler used to launch the kernels
+ * @in_mem: input image
+ * @out_mem: unused
+ * @dx: horizontal shift
+ * @dy: vertical shift
+ *
+ * Compute the shifted MSE of the padded input for the shift (@dx, @dy) and
+ * its 2D cumulative sum, which ends up in the first auxiliary buffer.
+ */
+static void
+compute_sdx (UfoNonLocalMeansTaskPrivate *priv,
+             cl_command_queue cmd_queue,
+             UfoProfiler *profiler,
+             cl_mem in_mem,
+             cl_mem out_mem,
+             const gint dx,
+             const gint dy)
+{
+    gfloat var = priv->sigma * priv->sigma;
+
+    /* First compute the shifted MSE */
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 0, sizeof (cl_mem), &in_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 1, sizeof (cl_mem), &priv->aux_mem[0]));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 2, sizeof (cl_sampler), &priv->sampler));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 3, sizeof (gint), &dx));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 4, sizeof (gint), &dy));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 5, sizeof (gint), &priv->padding));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->mse_kernel, 6, sizeof (gfloat), &var));
+    ufo_profiler_call (profiler, cmd_queue, priv->mse_kernel, 2, priv->padded_size, NULL);
+
+    /* Horizontal cumsum and transposition */
+    compute_cumsum (priv, cmd_queue, profiler, priv->aux_mem[0], priv->aux_mem[1], priv->padded_size[0], priv->padded_size[1]);
+    transpose (priv, cmd_queue, profiler, priv->aux_mem[1], priv->aux_mem[0],
+               priv->padded_size[0], priv->padded_size[1]);
+
+    /* 2D cumsum is separable, so compute the horizontal cumsum of the transposed horizontally cumsummed image */
+    compute_cumsum (priv, cmd_queue, profiler, priv->aux_mem[0], priv->aux_mem[1], priv->padded_size[1], priv->padded_size[0]);
+    transpose (priv, cmd_queue, profiler, priv->aux_mem[1], priv->aux_mem[0],
+               priv->padded_size[1], priv->padded_size[0]);
+}
+
+/**
+ * process_shift:
+ * @priv: UfoNonLocalMeansTaskPrivate
+ * @cmd_queue: command queue the kernels are enqueued on
+ * @profiler: profiler used to launch the kernels
+ * @input_image: input image
+ * @integral_mem: 2D cumulative sum computed by compute_sdx()
+ * @out_mem: buffer accumulating the weighted pixel values
+ * @dx: horizontal shift
+ * @dy: vertical shift
+ *
+ * Accumulate the weights and weighted pixel values for the shift (@dx, @dy)
+ * and, unless the shift is zero, also for the conjugate shift.
+ */
+static void
+process_shift (UfoNonLocalMeansTaskPrivate *priv,
+               cl_command_queue cmd_queue,
+               UfoProfiler *profiler,
+               cl_mem input_image,
+               cl_mem integral_mem,
+               cl_mem out_mem,
+               const gint dx,
+               const gint dy)
+{
+    gint patch_size = 2 * priv->patch_radius + 1;
+    gfloat coeff = 1.0f / (priv->h * priv->h * patch_size * patch_size);
+    gint is_conjugate = 0;
+
+    /* Compute r(x) = w(x) * f(x + dx) */
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 0, sizeof (cl_mem), &input_image));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 1, sizeof (cl_mem), &integral_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 2, sizeof (cl_mem), &priv->weights_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 3, sizeof (cl_mem), &out_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 4, sizeof (cl_sampler), &priv->sampler));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 5, sizeof (gint), &dx));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 6, sizeof (gint), &dy));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 7, sizeof (gint), &priv->patch_radius));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 8, sizeof (gint), &priv->padding));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 9, sizeof (gfloat), &coeff));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 10, sizeof (gint), &is_conjugate));
+    ufo_profiler_call (profiler, cmd_queue, priv->weight_kernel, 2, priv->cropped_size, NULL);
+
+    if (dx != 0 || dy != 0) {
+        /* Compute r(x + dx) = w(x + dx) * f(x), which cannot be computed at
+         * once in the above kernel call because one work item would write to
+         * two global memory locations, thus creating a race condition. */
+        is_conjugate = 1;
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 0, sizeof (cl_mem), &input_image));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 1, sizeof (cl_mem), &integral_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 2, sizeof (cl_mem), &priv->weights_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 3, sizeof (cl_mem), &out_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 4, sizeof (cl_sampler), &priv->sampler));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 5, sizeof (gint), &dx));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 6, sizeof (gint), &dy));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 7, sizeof (gint), &priv->patch_radius));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 8, sizeof (gint), &priv->padding));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 9, sizeof (gfloat), &coeff));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->weight_kernel, 10, sizeof (gint), &is_conjugate));
+        ufo_profiler_call (profiler, cmd_queue, priv->weight_kernel, 2, priv->cropped_size, NULL);
+    }
+}
+
+/**
+ * compute_nlm_fast:
+ * @priv: UfoNonLocalMeansTaskPrivate
+ * @cmd_queue: command queue the kernels are enqueued on
+ * @profiler: profiler used to launch the kernels
+ * @in_mem: input image
+ * @out_mem: output buffer, zero-filled by the caller
+ *
+ * Run the fast algorithm: accumulate the contributions of all shifts within
+ * the search radius and divide the result by the accumulated weights.
+ */
+static void
+compute_nlm_fast (UfoNonLocalMeansTaskPrivate *priv,
+                  cl_command_queue cmd_queue,
+                  UfoProfiler *profiler,
+                  cl_mem in_mem,
+                  cl_mem out_mem)
+{
+    gint dx, dy, sr;
+    sr = (gint) priv->search_radius;
+
+    /* Every pixel computes its own result and spreads it to the pixel y + dy,
+     * thus we start at dy = 0 and every pixel gets its dy < 0 value from the
+     * pixel y - dy. For dy = 0, compute only pixels to the right, so the
+     * negative values are obtained from the pixels to the left (analogous to
+     * the dy case).
+     */
+    for (dy = 0; dy < sr + 1; dy++) {
+        for (dx = dy == 0 ? 0 : -sr; dx < sr + 1; dx++) {
+            compute_sdx (priv, cmd_queue, profiler, in_mem, out_mem, dx, dy);
+            process_shift (priv, cmd_queue, profiler, in_mem, priv->aux_mem[0], out_mem, dx, dy);
+        }
+    }
+    /* Now we have the sum of results and weights, divide them to get the
+     * result.
+     */
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->divide_kernel, 0, sizeof (cl_mem), &priv->weights_mem));
+    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->divide_kernel, 1, sizeof (cl_mem), &out_mem));
+    ufo_profiler_call (profiler, cmd_queue, priv->divide_kernel, 2, priv->cropped_size, NULL);
+}
+
 UfoNode *
 ufo_non_local_means_task_new (void)
 {
@@ -131,11 +534,44 @@ ufo_non_local_means_task_setup (UfoTask *task,
     cl_int err;
 
     priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
-    priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error);
+    if (priv->fast) {
+        priv->mse_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "compute_shifted_mse", NULL, error);
+        priv->cumsum_kernel = ufo_resources_get_kernel (resources, "cumsum.cl", "cumsum", NULL, error);
+        priv->spread_kernel = ufo_resources_get_kernel (resources, "cumsum.cl", "spread_block_sums", NULL, error);
+        priv->transpose_kernel = ufo_resources_get_kernel (resources, "transpose.cl", "transpose_shared", NULL, error);
+        priv->weight_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "process_shift", NULL, error);
+        priv->divide_kernel = ufo_resources_get_kernel (resources, "nlm.cl", "divide_inplace", NULL, error);
+    } else {
+        priv->kernel = ufo_resources_get_kernel (resources, "nlm.cl", "nlm_noise_reduction", NULL, error);
+    }
     priv->window_mem = NULL;
+    priv->group_sums = NULL;
+    priv->weights_mem = NULL;
+    for (gint i = 0; i < 2; i++) {
+        priv->aux_mem[i] = NULL;
+    }
 
-    if (priv->kernel)
+    if (priv->kernel) {
         UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->kernel), error);
+    }
+    if (priv->mse_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->mse_kernel), error);
+    }
+    if (priv->cumsum_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->cumsum_kernel), error);
+    }
+    if (priv->spread_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->spread_kernel), error);
+    }
+    if (priv->transpose_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->transpose_kernel), error);
+    }
+    if (priv->weight_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->weight_kernel), error);
+    }
+    if (priv->divide_kernel) {
+        UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainKernel (priv->divide_kernel), error);
+    }
 
     priv->context = ufo_resources_get_context (resources);
     UFO_RESOURCES_CHECK_SET_AND_RETURN (clRetainContext (priv->context), error);
@@ -155,10 +591,35 @@ ufo_non_local_means_task_get_requisition (UfoTask *task,
                                           GError **error)
 {
     UfoNonLocalMeansTaskPrivate *priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
+    UfoGpuNode *node;
+    GValue *max_work_group_size_gvalue;
+
     ufo_buffer_get_requisition (inputs[0], requisition);
+
     if (priv->use_window && !priv->window_mem) {
         create_coefficients (priv);
     }
+    if (priv->fast) {
+        if (priv->max_work_group_size == 0) {
+            node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task)));
+            max_work_group_size_gvalue = ufo_gpu_node_get_info (node, UFO_GPU_NODE_INFO_MAX_WORK_GROUP_SIZE);
+            priv->max_work_group_size = g_value_get_ulong (max_work_group_size_gvalue);
+            g_value_unset (max_work_group_size_gvalue);
+        }
+        if (priv->cropped_size[0] == 0 || priv->cropped_size[0] != requisition->dims[0] ||
+            priv->cropped_size[1] != requisition->dims[1]) {
+            if (priv->cropped_size[0] != 0) {
+                /* Size changed, release first. */
+                release_fast_buffers (priv);
+            }
+            priv->cropped_size[0] = requisition->dims[0];
+            priv->cropped_size[1] = requisition->dims[1];
+            priv->padding = priv->search_radius + priv->patch_radius + 1;
+            priv->padded_size[0] = priv->cropped_size[0] + 2 * priv->padding;
+            priv->padded_size[1] = priv->cropped_size[1] + 2 * priv->padding;
+            create_fast_buffers (priv);
+        }
+    }
 }
 
 static guint
@@ -189,30 +650,41 @@ ufo_non_local_means_task_process (UfoTask *task,
     UfoNonLocalMeansTaskPrivate *priv;
     UfoGpuNode *node;
     UfoProfiler *profiler;
     cl_command_queue cmd_queue;
     cl_mem in_mem;
     cl_mem out_mem;
     gfloat h, var;
+    gfloat fill_pattern = 0.0f;
 
     priv = UFO_NON_LOCAL_MEANS_TASK_GET_PRIVATE (task);
     node = UFO_GPU_NODE (ufo_task_node_get_proc_node (UFO_TASK_NODE (task)));
     cmd_queue = ufo_gpu_node_get_cmd_queue (node);
-    in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue);
     out_mem = ufo_buffer_get_device_array (output, cmd_queue);
+    profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task));
     h = 1 / priv->h / priv->h;
     var = priv->sigma * priv->sigma;
 
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &h));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 6, sizeof (gfloat), &var));
-    UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 7, sizeof (cl_mem), &priv->window_mem));
-
-    profiler = ufo_task_node_get_profiler (UFO_TASK_NODE (task));
-    ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL);
+    if (priv->fast) {
+        in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue);
+        UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, out_mem, &fill_pattern, sizeof (gfloat), 0,
+                                                        sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1],
+                                                        0, NULL, NULL));
+        UFO_RESOURCES_CHECK_CLERR (clEnqueueFillBuffer (cmd_queue, priv->weights_mem, &fill_pattern, sizeof (gfloat), 0,
+                                                        sizeof (gfloat) * priv->cropped_size[0] * priv->cropped_size[1],
+                                                        0, NULL, NULL));
+        compute_nlm_fast (priv, cmd_queue, profiler, in_mem, out_mem);
+    } else {
+        in_mem = ufo_buffer_get_device_image (inputs[0], cmd_queue);
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 0, sizeof (cl_mem), &in_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 1, sizeof (cl_mem), &out_mem));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 2, sizeof (cl_sampler), &priv->sampler));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 3, sizeof (guint), &priv->search_radius));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 4, sizeof (guint), &priv->patch_radius));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 5, sizeof (gfloat), &h));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 6, sizeof (gfloat), &var));
+        UFO_RESOURCES_CHECK_CLERR (clSetKernelArg (priv->kernel, 7, sizeof (cl_mem), &priv->window_mem));
+        ufo_profiler_call (profiler, cmd_queue, priv->kernel, 2, requisition->dims, NULL);
+    }
 
     return TRUE;
 }
@@ -242,6 +714,9 @@ ufo_non_local_means_task_set_property (GObject *object,
         case PROP_WINDOW:
             priv->use_window = g_value_get_boolean (value);
             break;
+        case PROP_FAST:
+            priv->fast = g_value_get_boolean (value);
+            break;
         case PROP_ADDRESSING_MODE:
             priv->addressing_mode = g_value_get_enum (value);
             break;
@@ -275,6 +750,9 @@ ufo_non_local_means_task_get_property (GObject *object,
         case PROP_WINDOW:
             g_value_set_boolean (value, priv->use_window);
             break;
+        case PROP_FAST:
+            g_value_set_boolean (value, priv->fast);
+            break;
         case PROP_ADDRESSING_MODE:
             g_value_set_enum (value, priv->addressing_mode);
             break;
@@ -295,6 +773,30 @@ ufo_non_local_means_task_finalize (GObject *object)
         UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->kernel));
         priv->kernel = NULL;
     }
+    if (priv->mse_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->mse_kernel));
+        priv->mse_kernel = NULL;
+    }
+    if (priv->cumsum_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->cumsum_kernel));
+        priv->cumsum_kernel = NULL;
+    }
+    if (priv->spread_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->spread_kernel));
+        priv->spread_kernel = NULL;
+    }
+    if (priv->transpose_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->transpose_kernel));
+        priv->transpose_kernel = NULL;
+    }
+    if (priv->weight_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->weight_kernel));
+        priv->weight_kernel = NULL;
+    }
+    if (priv->divide_kernel) {
+        UFO_RESOURCES_CHECK_CLERR (clReleaseKernel (priv->divide_kernel));
+        priv->divide_kernel = NULL;
+    }
     if (priv->sampler) {
         UFO_RESOURCES_CHECK_CLERR (clReleaseSampler (priv->sampler));
         priv->sampler = NULL;
@@ -304,6 +806,7 @@ ufo_non_local_means_task_finalize (GObject *object)
         priv->context = NULL;
     }
     release_coefficients (priv);
+    release_fast_buffers (priv);
 
     G_OBJECT_CLASS (ufo_non_local_means_task_parent_class)->finalize (object);
 }
@@ -363,6 +866,13 @@ ufo_non_local_means_task_class_init (UfoNonLocalMeansTaskClass *klass)
                               TRUE,
                               G_PARAM_READWRITE);
 
+    properties[PROP_FAST] =
+        g_param_spec_boolean ("fast",
+                              "Use a faster version of the algorithm",
+                              "Use a faster version of the algorithm",
+                              FALSE,
+                              G_PARAM_READWRITE);
+
     properties[PROP_ADDRESSING_MODE] =
         g_param_spec_enum ("addressing-mode",
                            "Outlier treatment (\"none\", \"clamp\", \"clamp_to_edge\", \"repeat\", \"mirrored_repeat\")",
@@ -384,8 +894,15 @@ ufo_non_local_means_task_init(UfoNonLocalMeansTask *self)
 
     self->priv->search_radius = 10;
     self->priv->patch_radius = 3;
+    self->priv->cropped_size[0] = 0;
+    self->priv->cropped_size[1] = 0;
+    self->priv->padded_size[0] = 0;
+    self->priv->padded_size[1] = 0;
+    self->priv->padding = -1;
     self->priv->h = 0.1f;
     self->priv->sigma = 0.0f;
+    self->priv->max_work_group_size = 0;
     self->priv->use_window = TRUE;
     self->priv->addressing_mode = CL_ADDRESS_MIRRORED_REPEAT;
+    self->priv->fast = FALSE;
 }