diff options
author | Matthias Vogelgesang <matthias.vogelgesang@kit.edu> | 2014-03-06 15:10:44 +0100 |
---|---|---|
committer | Matthias Vogelgesang <matthias.vogelgesang@kit.edu> | 2014-03-06 15:10:44 +0100 |
commit | 9badcf804e399a685cb378a99951f247b1b9039a (patch) | |
tree | 61220df3e10771c5f36d56f31a261fcf73cea259 /deps | |
parent | 949d4f95cad502ee1388476e28505e686c914c52 (diff) | |
download | ufo-filters-9badcf804e399a685cb378a99951f247b1b9039a.tar.gz ufo-filters-9badcf804e399a685cb378a99951f247b1b9039a.tar.bz2 ufo-filters-9badcf804e399a685cb378a99951f247b1b9039a.tar.xz ufo-filters-9badcf804e399a685cb378a99951f247b1b9039a.zip |
Fix #5: integrate oclfft and drop lib dependency
Diffstat (limited to 'deps')
-rw-r--r-- | deps/CMakeLists.txt | 1 | ||||
-rw-r--r-- | deps/oclfft/CMakeLists.txt | 10 | ||||
-rw-r--r-- | deps/oclfft/clFFT.h | 129 | ||||
-rw-r--r-- | deps/oclfft/fft_base_kernels.h | 277 | ||||
-rw-r--r-- | deps/oclfft/fft_execute.cpp | 405 | ||||
-rw-r--r-- | deps/oclfft/fft_internal.h | 161 | ||||
-rw-r--r-- | deps/oclfft/fft_kernelstring.cpp | 1256 | ||||
-rw-r--r-- | deps/oclfft/fft_setup.cpp | 389 | ||||
-rw-r--r-- | deps/oclfft/procs.h | 53 |
9 files changed, 2681 insertions, 0 deletions
diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt new file mode 100644 index 0000000..52b844f --- /dev/null +++ b/deps/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(oclfft) diff --git a/deps/oclfft/CMakeLists.txt b/deps/oclfft/CMakeLists.txt new file mode 100644 index 0000000..a53c551 --- /dev/null +++ b/deps/oclfft/CMakeLists.txt @@ -0,0 +1,10 @@ +include_directories(${OPENCL_INCLUDE_DIRS}) + +add_library(oclfft STATIC + fft_execute.cpp + fft_setup.cpp + fft_kernelstring.cpp) + +set_target_properties(oclfft PROPERTIES LINKER_LANGUAGE C) + +target_link_libraries(oclfft ${OPENCL_LIBRARIES}) diff --git a/deps/oclfft/clFFT.h b/deps/oclfft/clFFT.h new file mode 100644 index 0000000..e893d95 --- /dev/null +++ b/deps/oclfft/clFFT.h @@ -0,0 +1,129 @@ + +// +// File: clFFT.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifndef __CLFFT_H +#define __CLFFT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <CL/cl.h> +#include <stdio.h> + +// XForm type +typedef enum +{ + clFFT_Forward = -1, + clFFT_Inverse = 1 + +}clFFT_Direction; + +// XForm dimension +typedef enum +{ + clFFT_1D = 0, + clFFT_2D = 1, + clFFT_3D = 3 + +}clFFT_Dimension; + +// XForm Data type +typedef enum +{ + clFFT_SplitComplexFormat = 0, + clFFT_InterleavedComplexFormat = 1 +}clFFT_DataFormat; + +typedef struct +{ + unsigned int x; + unsigned int y; + unsigned int z; +}clFFT_Dim3; + +typedef struct +{ + float *real; + float *imag; +} clFFT_SplitComplex; + +typedef struct +{ + float real; + float imag; +}clFFT_Complex; + +typedef void* clFFT_Plan; + +clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ); + +void clFFT_DestroyPlan( clFFT_Plan plan ); + +cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in, cl_mem data_out, + cl_int num_events, cl_event *event_list, cl_event *event ); + +cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, + cl_int num_events, cl_event *event_list, cl_event *event ); + +cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); + + +cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); + +void clFFT_DumpPlan( clFFT_Plan plan, FILE *file); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/deps/oclfft/fft_base_kernels.h b/deps/oclfft/fft_base_kernels.h new file mode 100644 index 0000000..1017956 --- /dev/null +++ b/deps/oclfft/fft_base_kernels.h @@ -0,0 +1,277 @@ + +// +// File: fft_base_kernels.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifndef __CL_FFT_BASE_KERNELS_ +#define __CL_FFT_BASE_KERNELS_ + +#include <string> + +using namespace std; + +static string baseKernels = string( + "#ifndef M_PI\n" + "#define M_PI 0x1.921fb54442d18p+1\n" + "#endif\n" + "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" + "#define conj(a) ((float2)((a).x, -(a).y))\n" + "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n" + "\n" + "#define fftKernel2(a,dir) \\\n" + "{ \\\n" + " float2 c = (a)[0]; \\\n" + " (a)[0] = c + (a)[1]; \\\n" + " (a)[1] = c - (a)[1]; \\\n" + "}\n" + "\n" + "#define fftKernel2S(d1,d2,dir) \\\n" + "{ \\\n" + " float2 c = (d1); \\\n" + " (d1) = c + (d2); \\\n" + " (d2) = c - (d2); \\\n" + "}\n" + "\n" + "#define fftKernel4(a,dir) \\\n" + "{ \\\n" + " fftKernel2S((a)[0], (a)[2], dir); \\\n" + " fftKernel2S((a)[1], (a)[3], dir); \\\n" + " fftKernel2S((a)[0], (a)[1], dir); \\\n" + " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" + " fftKernel2S((a)[2], (a)[3], dir); \\\n" + " float2 c = (a)[1]; \\\n" + " (a)[1] = (a)[2]; \\\n" + " (a)[2] = c; \\\n" + "}\n" + "\n" + "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n" + "{ \\\n" + " fftKernel2S((a0), (a2), dir); \\\n" + " fftKernel2S((a1), (a3), dir); \\\n" + " fftKernel2S((a0), (a1), dir); \\\n" + " (a3) = (float2)(dir)*(conjTransp((a3))); \\\n" + " fftKernel2S((a2), (a3), dir); \\\n" + " float2 c = (a1); \\\n" + " (a1) = (a2); \\\n" + " (a2) = c; \\\n" + "}\n" + "\n" + "#define bitreverse8(a) \\\n" + "{ \\\n" + " float2 c; \\\n" + " c = (a)[1]; \\\n" + " (a)[1] = (a)[4]; \\\n" + " (a)[4] = c; \\\n" + " c = (a)[3]; \\\n" + " (a)[3] = (a)[6]; \\\n" + " (a)[6] = c; \\\n" + "}\n" + "\n" + "#define fftKernel8(a,dir) \\\n" + "{ \\\n" + " const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" + " const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" + " float2 c; \\\n" + " fftKernel2S((a)[0], (a)[4], dir); \\\n" + " fftKernel2S((a)[1], (a)[5], dir); \\\n" + " fftKernel2S((a)[2], (a)[6], dir); \\\n" + " fftKernel2S((a)[3], (a)[7], dir); \\\n" + " (a)[5] = complexMul(w1, (a)[5]); \\\n" + " (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n" + " (a)[7] = complexMul(w3, (a)[7]); \\\n" + " fftKernel2S((a)[0], (a)[2], dir); \\\n" + " fftKernel2S((a)[1], (a)[3], dir); \\\n" + " fftKernel2S((a)[4], (a)[6], dir); \\\n" + " fftKernel2S((a)[5], (a)[7], dir); \\\n" + " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" + " (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n" + " fftKernel2S((a)[0], (a)[1], dir); \\\n" + " fftKernel2S((a)[2], (a)[3], dir); \\\n" + " fftKernel2S((a)[4], (a)[5], dir); \\\n" + " fftKernel2S((a)[6], (a)[7], dir); \\\n" + " bitreverse8((a)); \\\n" + "}\n" + "\n" + "#define bitreverse4x4(a) \\\n" + "{ \\\n" + " float2 c; \\\n" + " c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n" + " c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n" + " c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n" + " c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n" + " c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n" + " c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n" + "}\n" + "\n" + "#define fftKernel16(a,dir) \\\n" + "{ \\\n" + " const float w0 = 0x1.d906bcp-1f; \\\n" + " const float w1 = 0x1.87de2ap-2f; \\\n" + " const float w2 = 0x1.6a09e6p-1f; \\\n" + " fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n" + " fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n" + " fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n" + " fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n" + " (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n" + " (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n" + " (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n" + " (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n" + " (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n" + " (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n" + " (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n" + " (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n" + " (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n" + " fftKernel4((a), dir); \\\n" + " fftKernel4((a) + 4, dir); \\\n" + " fftKernel4((a) + 8, dir); \\\n" + " fftKernel4((a) + 12, dir); \\\n" + " bitreverse4x4((a)); \\\n" + "}\n" + "\n" + "#define bitreverse32(a) \\\n" + "{ \\\n" + " float2 c1, c2; \\\n" + " c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \\\n" + " c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \\\n" + " c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \\\n" + " c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \\\n" + " c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n" + " c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n" + "}\n" + "\n" + "#define fftKernel32(a,dir) \\\n" + "{ \\\n" + " fftKernel2S((a)[0], (a)[16], dir); \\\n" + " fftKernel2S((a)[1], (a)[17], dir); \\\n" + " fftKernel2S((a)[2], (a)[18], dir); \\\n" + " fftKernel2S((a)[3], (a)[19], dir); \\\n" + " fftKernel2S((a)[4], (a)[20], dir); \\\n" + " fftKernel2S((a)[5], (a)[21], dir); \\\n" + " fftKernel2S((a)[6], (a)[22], dir); \\\n" + " fftKernel2S((a)[7], (a)[23], dir); \\\n" + " fftKernel2S((a)[8], (a)[24], dir); \\\n" + " fftKernel2S((a)[9], (a)[25], dir); \\\n" + " fftKernel2S((a)[10], (a)[26], dir); \\\n" + " fftKernel2S((a)[11], (a)[27], dir); \\\n" + " fftKernel2S((a)[12], (a)[28], dir); \\\n" + " fftKernel2S((a)[13], (a)[29], dir); \\\n" + " fftKernel2S((a)[14], (a)[30], dir); \\\n" + " fftKernel2S((a)[15], (a)[31], dir); \\\n" + " (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" + " (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" + " (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" + " (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" + " (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" + " (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" + " (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" + " (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n" + " (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" + " (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" + " (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" + " (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" + " (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" + " (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" + " (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" + " fftKernel16((a), dir); \\\n" + " fftKernel16((a) + 16, dir); \\\n" + " bitreverse32((a)); \\\n" + "}\n\n" + ); + +static string twistKernelInterleaved = string( + "__kernel void \\\n" + "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" + "{ \\\n" + " float2 a, w; \\\n" + " float ang; \\\n" + " unsigned int j; \\\n" + " unsigned int i = get_global_id(0); \\\n" + " unsigned int startIndex = i; \\\n" + " \\\n" + " if(i < numCols) \\\n" + " { \\\n" + " for(j = 0; j < numRowsToProcess; j++) \\\n" + " { \\\n" + " a = in[startIndex]; \\\n" + " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" + " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" + " a = complexMul(a, w); \\\n" + " in[startIndex] = a; \\\n" + " startIndex += numCols; \\\n" + " } \\\n" + " } \\\n" + "} \\\n" + ); + +static string twistKernelPlannar = string( + "__kernel void \\\n" + "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" + "{ \\\n" + " float2 a, w; \\\n" + " float ang; \\\n" + " unsigned int j; \\\n" + " unsigned int i = get_global_id(0); \\\n" + " unsigned int startIndex = i; \\\n" + " \\\n" + " if(i < numCols) \\\n" + " { \\\n" + " for(j = 0; j < numRowsToProcess; j++) \\\n" + " { \\\n" + " a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n" + " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" + " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" + " a = complexMul(a, w); \\\n" + " in_real[startIndex] = a.x; \\\n" + " in_imag[startIndex] = a.y; \\\n" + " startIndex += numCols; \\\n" + " } \\\n" + " } \\\n" + "} \\\n" + ); + + + +#endif diff --git a/deps/oclfft/fft_execute.cpp b/deps/oclfft/fft_execute.cpp new file mode 100644 index 0000000..64dacdf --- /dev/null +++ b/deps/oclfft/fft_execute.cpp @@ -0,0 +1,405 @@ + +// +// File: fft_execute.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software.¬ +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include "fft_internal.h" +#include "clFFT.h" +#include <stdlib.h> +#include <stdio.h> +#include <math.h> + +#define max(a,b) (((a)>(b)) ? (a) : (b)) +#define min(a,b) (((a)<(b)) ? (a) : (b)) + +static cl_int +allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize) +{ + cl_int err = CL_SUCCESS; + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + { + plan->last_batch_size = batchSize; + size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); + + if(plan->tempmemobj) + clReleaseMemObject(plan->tempmemobj); + + plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); + } + return err; +} + +static cl_int +allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize) +{ + cl_int err = CL_SUCCESS; + cl_int terr; + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + { + plan->last_batch_size = batchSize; + size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float); + + if(plan->tempmemobj_real) + clReleaseMemObject(plan->tempmemobj_real); + + if(plan->tempmemobj_imag) + clReleaseMemObject(plan->tempmemobj_imag); + + plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); + plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr); + err |= terr; + } + return err; +} + +void +getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems) +{ + *lWorkItems = kernelInfo->num_workitems_per_workgroup; + int numWorkGroups = kernelInfo->num_workgroups; + int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup; + + switch(kernelInfo->dir) + { + case cl_fft_kernel_x: + *batchSize *= (plan->n.y * plan->n.z); + numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG); + numWorkGroups *= kernelInfo->num_workgroups; + break; + case cl_fft_kernel_y: + *batchSize *= plan->n.z; + numWorkGroups *= *batchSize; + break; + case cl_fft_kernel_z: + numWorkGroups *= *batchSize; + break; + } + + *gWorkItems = numWorkGroups * *lWorkItems; +} + +cl_int +clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in, cl_mem data_out, + cl_int num_events, cl_event *event_list, cl_event *event ) +{ + int s; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + if(plan->format != clFFT_InterleavedComplexFormat) + return CL_INVALID_VALUE; + + cl_int err; + size_t gWorkItems, lWorkItems; + int inPlaceDone; + + cl_int isInPlace = data_in == data_out ? 1 : 0; + + if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) + return err; + + cl_mem memObj[3]; + memObj[0] = data_in; + memObj[1] = data_out; + memObj[2] = plan->tempmemobj; + cl_fft_kernel_info *kernelInfo = plan->kernel_info; + int numKernels = plan->num_kernels; + + int numKernelsOdd = numKernels & 1; + int currRead = 0; + int currWrite = 1; + + // at least one external dram shuffle (transpose) required + if(plan->temp_buffer_needed) + { + // in-place transform + if(isInPlace) + { + inPlaceDone = 0; + currRead = 1; + currWrite = 2; + } + else + { + currWrite = (numKernels & 1) ? 1 : 2; + } + + while(kernelInfo) + { + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + { + currWrite = currRead; + inPlaceDone = 1; + } + + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, num_events, event_list, event); + if(err) + return err; + + currRead = (currWrite == 1) ? 1 : 2; + currWrite = (currWrite == 1) ? 2 : 1; + + kernelInfo = kernelInfo->next; + } + } + // no dram shuffle (transpose required) transform + // all kernels can execute in-place. + else { + + while(kernelInfo) + { + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, num_events, event_list, event); + if(err) + return err; + + currRead = 1; + currWrite = 1; + + kernelInfo = kernelInfo->next; + } + } + + return err; +} + +cl_int +clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, + cl_int num_events, cl_event *event_list, cl_event *event) +{ + int s; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + if(plan->format != clFFT_SplitComplexFormat) + return CL_INVALID_VALUE; + + cl_int err; + size_t gWorkItems, lWorkItems; + int inPlaceDone; + + cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0; + + if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS) + return err; + + cl_mem memObj_real[3]; + cl_mem memObj_imag[3]; + memObj_real[0] = data_in_real; + memObj_real[1] = data_out_real; + memObj_real[2] = plan->tempmemobj_real; + memObj_imag[0] = data_in_imag; + memObj_imag[1] = data_out_imag; + memObj_imag[2] = plan->tempmemobj_imag; + + cl_fft_kernel_info *kernelInfo = plan->kernel_info; + int numKernels = plan->num_kernels; + + int numKernelsOdd = numKernels & 1; + int currRead = 0; + int currWrite = 1; + + // at least one external dram shuffle (transpose) required + if(plan->temp_buffer_needed) + { + // in-place transform + if(isInPlace) + { + inPlaceDone = 0; + currRead = 1; + currWrite = 2; + } + else + { + currWrite = (numKernels & 1) ? 1 : 2; + } + + while(kernelInfo) + { + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + { + currWrite = currRead; + inPlaceDone = 1; + } + + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, num_events, event_list, event); + if(err) + return err; + + currRead = (currWrite == 1) ? 1 : 2; + currWrite = (currWrite == 1) ? 2 : 1; + + kernelInfo = kernelInfo->next; + } + } + // no dram shuffle (transpose required) transform + else { + + while(kernelInfo) + { + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, num_events, event_list, event); + if(err) + return err; + + currRead = 1; + currWrite = 1; + + kernelInfo = kernelInfo->next; + } + } + + return err; +} + +cl_int +clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) +{ + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + unsigned int N = numRows*numCols; + unsigned int nCols = numCols; + unsigned int sRow = startRow; + unsigned int rToProcess = rowsToProcess; + int d = dir; + int err = 0; + + cl_device_id device_id; + err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); + if(err) + return err; + + size_t gSize; + err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); + if(err) + return err; + + gSize = min(128, gSize); + size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; + size_t numLocalThreads[1] = { gSize }; + + err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array); + err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow); + err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols); + err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N); + err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess); + err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d); + + err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); + + return err; +} + +cl_int +clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) +{ + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + unsigned int N = numRows*numCols; + unsigned int nCols = numCols; + unsigned int sRow = startRow; + unsigned int rToProcess = rowsToProcess; + int d = dir; + int err = 0; + + cl_device_id device_id; + err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); + if(err) + return err; + + size_t gSize; + err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); + if(err) + return err; + + gSize = min(128, gSize); + size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; + size_t numLocalThreads[1] = { gSize }; + + err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real); + err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag); + err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow); + err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols); + err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N); + err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess); + err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d); + + err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); + + return err; +} + diff --git a/deps/oclfft/fft_internal.h b/deps/oclfft/fft_internal.h new file mode 100644 index 0000000..b307d5d --- /dev/null +++ b/deps/oclfft/fft_internal.h @@ -0,0 +1,161 @@ + +// +// File: fft_internal.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifndef __CLFFT_INTERNAL_H +#define __CLFFT_INTERNAL_H + +#include "clFFT.h" +#include <sstream> + +using namespace std; + +typedef enum kernel_dir_t +{ + cl_fft_kernel_x, + cl_fft_kernel_y, + cl_fft_kernel_z +}cl_fft_kernel_dir; + +typedef struct kernel_info_t +{ + cl_kernel kernel; + char *kernel_name; + size_t lmem_size; + size_t num_workgroups; + size_t num_xforms_per_workgroup; + size_t num_workitems_per_workgroup; + cl_fft_kernel_dir dir; + int in_place_possible; + kernel_info_t *next; +}cl_fft_kernel_info; + +typedef struct +{ + // context in which fft resources are created and kernels are executed + cl_context context; + + // size of signal + clFFT_Dim3 n; + + // dimension of transform ... must be either 1D, 2D or 3D + clFFT_Dimension dim; + + // data format ... must be either interleaved or plannar + clFFT_DataFormat format; + + // string containing kernel source. Generated at runtime based on + // n, dim, format and other parameters + string *kernel_string; + + // CL program containing source and kernel this particular + // n, dim, data format + cl_program program; + + // linked list of kernels which needs to be executed for this fft + cl_fft_kernel_info *kernel_info; + + // number of kernels + int num_kernels; + + // twist kernel for virtualizing fft of very large sizes that do not + // fit in GPU global memory + cl_kernel twist_kernel; + + // flag indicating if temporary intermediate buffer is needed or not. + // this depends on fft kernels being executed and if transform is + // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... + // one that does not require global transpose do not need temporary buffer) + // 2D 1024x1024 out-of-place fft however do require intermediate buffer. + // If temp buffer is needed, its allocation is lazy i.e. its not allocated + // until its needed + cl_int temp_buffer_needed; + + // Batch size is runtime parameter and size of temporary buffer (if needed) + // depends on batch size. Allocation of temporary buffer is lazy i.e. its + // only created when needed. Once its created at first call of clFFT_Executexxx + // it is not allocated next time if next time clFFT_Executexxx is called with + // batch size different than the first call. last_batch_size caches the last + // batch size with which this plan is used so that we dont keep allocating/deallocating + // temp buffer if same batch size is used again and again. + size_t last_batch_size; + + // temporary buffer for interleaved plan + cl_mem tempmemobj; + + // temporary buffer for planner plan. Only one of tempmemobj or + // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending + // data format of plan (plannar or interleaved) + cl_mem tempmemobj_real, tempmemobj_imag; + + // Maximum size of signal for which local memory transposed based + // fft is sufficient i.e. no global mem transpose (communication) + // is needed + size_t max_localmem_fft_size; + + // Maximum work items per work group allowed. This, along with max_radix below controls + // maximum local memory being used by fft kernels of this plan. Set to 256 by default + size_t max_work_item_per_workgroup; + + // Maximum base radix for local memory fft ... this controls the maximum register + // space used by work items. Currently defaults to 16 + size_t max_radix; + + // Device depended parameter that tells how many work-items need to be read consecutive + // values to make sure global memory access by work-items of a work-group result in + // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 + size_t min_mem_coalesce_width; + + // Number of local memory banks. This is used to geneate kernel with local memory + // transposes with appropriate padding to avoid bank conflicts to local memory + // e.g. on NVidia it is 16. + size_t num_local_mem_banks; +}cl_fft_plan; + +void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); + +#endif diff --git a/deps/oclfft/fft_kernelstring.cpp b/deps/oclfft/fft_kernelstring.cpp new file mode 100644 index 0000000..116294f --- /dev/null +++ b/deps/oclfft/fft_kernelstring.cpp @@ -0,0 +1,1256 @@ + +// +// File: fft_kernelstring.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <iostream> +#include <sstream> +#include <string.h> +#include <assert.h> +#include "fft_internal.h" +#include "clFFT.h" + +using namespace std; + +#define max(A,B) ((A) > (B) ? (A) : (B)) +#define min(A,B) ((A) < (B) ? (A) : (B)) + +static string +num2str(int num) +{ + char temp[200]; + sprintf(temp, "%d", num); + return string(temp); +} + +// For any n, this function decomposes n into factors for loacal memory tranpose +// based fft. Factors (radices) are sorted such that the first one (radixArray[0]) +// is the largest. This base radix determines the number of registers used by each +// work item and product of remaining radices determine the size of work group needed. +// To make things concrete with and example, suppose n = 1024. It is decomposed into +// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and +// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length +// 16 ffts (64 work items working in parallel) following by transpose using local +// memory followed by again 64 length 16 ffts followed by transpose using local memory +// followed by 256 length 4 ffts. For the last step since with size of work group is +// 64 and each work item can array for 16 values, 64 work items can compute 256 length +// 4 ffts by each work item computing 4 length 4 ffts. +// Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work +// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed +// by transpose using local memory, followed by 256 length 8 in-register ffts, followed +// by transpose using local memory, followed by 256 length 8 in-register ffts, followed +// by transpose using local memory, followed by 512 length 4 in-register ffts. Again, +// for the last step, each work item computes two length 4 in-register ffts and thus +// 256 work items are needed to compute all 512 ffts. +// For n = 32 = 8 x 4, 4 work items first compute 4 in-register +// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register +// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items +// can compute 8 length 4 ffts. However if work group size of say 64 is choosen, +// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform). +// Users can play with these parameters to figure what gives best performance on +// their particular device i.e. some device have less register space thus using +// smaller base radix can avoid spilling ... some has small local memory thus +// using smaller work group size may be required etc + +static void +getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix) +{ + if(maxRadix > 1) + { + maxRadix = min(n, maxRadix); + unsigned int cnt = 0; + while(n > maxRadix) + { + radixArray[cnt++] = maxRadix; + n /= maxRadix; + } + radixArray[cnt++] = n; + *numRadices = cnt; + return; + } + + switch(n) + { + case 2: + *numRadices = 1; + radixArray[0] = 2; + break; + + case 4: + *numRadices = 1; + radixArray[0] = 4; + break; + + case 8: + *numRadices = 1; + radixArray[0] = 8; + break; + + case 16: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 2; + break; + + case 32: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 4; + break; + + case 64: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 8; + break; + + case 128: + *numRadices = 3; + radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4; + break; + + case 256: + *numRadices = 4; + radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4; + break; + + case 512: + *numRadices = 3; + radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; + break; + + case 1024: + *numRadices = 3; + radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4; + break; + case 2048: + *numRadices = 4; + radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4; + break; + default: + *numRadices = 0; + return; + } +} + +static void +insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat) +{ + if(dataFormat == clFFT_SplitComplexFormat) + kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); + else + kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n"); +} + +static void +insertVariables(string &kStream, int maxRadix) +{ + kStream += string(" int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n"); + kStream += string(" int s, ii, jj, offset;\n"); + kStream += string(" float2 w;\n"); + kStream += string(" float ang, angf, ang1;\n"); + kStream += string(" __local float *lMemStore, *lMemLoad;\n"); + kStream += string(" float2 a[") + num2str(maxRadix) + string("];\n"); + kStream += string(" int lId = get_local_id( 0 );\n"); + kStream += string(" int groupId = get_group_id( 0 );\n"); +} + +static void +formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) +{ + if(dataFormat == clFFT_InterleavedComplexFormat) + kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n"); + else + { + kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n"); + kernelString += string(" a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n"); + } +} + +static void +formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) +{ + if(dataFormat == clFFT_InterleavedComplexFormat) + kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n"); + else + { + kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n"); + kernelString += string(" out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n"); + } +} + +static int +insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat) +{ + int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm); + int groupSize = numWorkItemsPerXForm * numXFormsPerWG; + int i, j; + int lMemSize = 0; + + if(numXFormsPerWG > 1) + kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n"); + + if(numWorkItemsPerXForm >= mem_coalesce_width) + { + if(numXFormsPerWG > 1) + { + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); + kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); + kernelString += string(" offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + for(i = 0; i < R0; i++) + formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); + kernelString += string(" }\n"); + } + else + { + kernelString += string(" ii = lId;\n"); + kernelString += string(" jj = 0;\n"); + kernelString += string(" offset = mad24(groupId, ") + num2str(N) + string(", ii);\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + for(i = 0; i < R0; i++) + formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); + } + } + else if( N >= mem_coalesce_width ) + { + int numInnerIter = N / mem_coalesce_width; + int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); + + kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + kernelString += string(" offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n"); + kernelString += string(" offset = mad24( offset, ") + num2str(N) + string(", ii );\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + kernelString += string(" if( jj < s ) {\n"); + for(j = 0; j < numInnerIter; j++ ) + formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); + kernelString += string(" }\n"); + if(i != numOuterIter - 1) + kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); + } + kernelString += string("}\n "); + kernelString += string("else {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + for(j = 0; j < numInnerIter; j++ ) + formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); + } + kernelString += string("}\n"); + + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n"); + + for( i = 0; i < numOuterIter; i++ ) + { + for( j = 0; j < numInnerIter; j++ ) + { + kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + + num2str(i * numInnerIter + j) + string("].x;\n"); + } + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < numOuterIter; i++ ) + { + for( j = 0; j < numInnerIter; j++ ) + { + kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + + num2str(i * numInnerIter + j) + string("].y;\n"); + } + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + else + { + kernelString += string(" offset = mad24( groupId, ") + num2str(N * numXFormsPerWG) + string(", lId );\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + + kernelString += string(" ii = lId & ") + num2str(N-1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int)log2(N)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for( i = 0; i < R0; i++ ) + { + kernelString += string(" if(jj < s )\n"); + formattedLoad(kernelString, i, i*groupSize, dataFormat); + if(i != R0 - 1) + kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n"); + } + kernelString += string("}\n"); + kernelString += string("else {\n"); + for( i = 0; i < R0; i++ ) + { + formattedLoad(kernelString, i, i*groupSize, dataFormat); + } + kernelString += string("}\n"); + + if(numWorkItemsPerXForm > 1) + { + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + } + else + { + kernelString += string(" ii = 0;\n"); + kernelString += string(" jj = lId;\n"); + kernelString += string(" lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n"); + } + + + for( i = 0; i < R0; i++ ) + kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + + return lMemSize; +} + +static int +insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat) +{ + int groupSize = numWorkItemsPerXForm * numXFormsPerWG; + int i, j, k, ind; + int lMemSize = 0; + int numIter = maxRadix / Nr; + string indent = string(""); + + if( numWorkItemsPerXForm >= mem_coalesce_width ) + { + if(numXFormsPerWG > 1) + { + kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); + indent = string(" "); + } + for(i = 0; i < maxRadix; i++) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat); + } + if(numXFormsPerWG > 1) + kernelString += string(" }\n"); + } + else if( N >= mem_coalesce_width ) + { + int numInnerIter = N / mem_coalesce_width; + int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); + + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < numOuterIter; i++ ) + for( j = 0; j < numInnerIter; j++ ) + kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < numOuterIter; i++ ) + for( j = 0; j < numInnerIter; j++ ) + kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + kernelString += string(" if( jj < s ) {\n"); + for(j = 0; j < numInnerIter; j++ ) + formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); + kernelString += string(" }\n"); + if(i != numOuterIter - 1) + kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); + } + kernelString += string("}\n"); + kernelString += string("else {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + for(j = 0; j < numInnerIter; j++ ) + formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); + } + kernelString += string("}\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + else + { + kernelString += string(" lMemLoad = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int) log2(N)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for( i = 0; i < maxRadix; i++ ) + { + kernelString += string(" if(jj < s ) {\n"); + formattedStore(kernelString, i, i*groupSize, dataFormat); + kernelString += string(" }\n"); + if( i != maxRadix - 1) + kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n"); + } + kernelString += string("}\n"); + kernelString += string("else {\n"); + for( i = 0; i < maxRadix; i++ ) + { + formattedStore(kernelString, i, i*groupSize, dataFormat); + } + kernelString += string("}\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + + return lMemSize; +} + +static void +insertfftKernel(string &kernelString, int Nr, int numIter) +{ + int i; + for(i = 0; i < numIter; i++) + { + kernelString += string(" fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n"); + } +} + +static void +insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) +{ + int z, k; + int logNPrev = log2(Nprev); + + for(z = 0; z < numIter; z++) + { + if(z == 0) + { + if(Nprev > 1) + kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n"); + else + kernelString += string(" angf = (float) ii;\n"); + } + else + { + if(Nprev > 1) + kernelString += string(" angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); + else + kernelString += string(" angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n"); + } + + for(k = 1; k < Nr; k++) { + int ind = z*Nr + k; + //float fac = (float) (2.0 * M_PI * (double) k / (double) len); + kernelString += string(" ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n"); + kernelString += string(" w = (float2)(native_cos(ang), native_sin(ang));\n"); + kernelString += string(" a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n"); + } + } +} + +static int +getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad) +{ + if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) + *offset = 0; + else { + int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev; + int numColsReq = 1; + if(numRowsReq > Nr) + numColsReq = numRowsReq / Nr; + numColsReq = Nprev * numColsReq; + *offset = numColsReq; + } + + if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) + *midPad = 0; + else { + int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1); + if( bankNum >= numWorkItemsPerXForm ) + *midPad = 0; + else + *midPad = numWorkItemsPerXForm - bankNum; + } + + int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1); + return lMemSize; +} + + +static void +insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) +{ + int z, k; + + for(z = 0; z < numIter; z++) { + for(k = 0; k < Nr; k++) { + int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm; + kernelString += string(" lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n"); + } + } + kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); +} + +static void +insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) +{ + int numWorkItemsReqN = n / Nrn; + int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 ); + int interBlockHStride = numWorkItemsPerXForm; + int vertWidth = max(numWorkItemsPerXForm / Nprev, 1); + vertWidth = min( vertWidth, Nr); + int vertNum = Nr / vertWidth; + int vertStride = ( n / Nr + offset ) * vertWidth; + int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1); + int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1; + intraBlockHStride *= Nprev; + + int stride = numWorkItemsReq / Nrn; + int i; + for(i = 0; i < iter; i++) { + int ii = i / (interBlockHNum * vertNum); + int zz = i % (interBlockHNum * vertNum); + int jj = zz % interBlockHNum; + int kk = zz / interBlockHNum; + int z; + for(z = 0; z < Nrn; z++) { + int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride; + kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n"); + } + } + kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); +} + +static void +insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) +{ + int Ncurr = Nprev * Nr; + int logNcurr = log2(Ncurr); + int logNprev = log2(Nprev); + int incr = (numWorkItemsReq + offset) * Nr + midPad; + + if(Ncurr < numWorkItemsPerXForm) + { + if(Nprev == 1) + kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n"); + else + kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n"); + + if(Nprev == 1) + kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n"); + else + kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); + } + else + { + if(Nprev == 1) + kernelString += string(" j = ii;\n"); + else + kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n"); + if(Nprev == 1) + kernelString += string(" i = 0;\n"); + else + kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n"); + } + + if(numXFormsPerWG > 1) + kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n"); + + kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); +} + +static void +insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) +{ + if(numXFormsPerWG == 1) { + kernelString += string(" lMemStore = sMem + ii;\n"); + } + else { + kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n"); + } +} + + +static void +createLocalMemfftKernelString(cl_fft_plan *plan) +{ + unsigned int radixArray[10]; + unsigned int numRadix; + + unsigned int n = plan->n.x; + + assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n"); + + getRadixArray(n, radixArray, &numRadix, 0); + assert(numRadix > 0 && "no radix array supplied\n"); + + if(n/radixArray[0] > plan->max_work_item_per_workgroup) + getRadixArray(n, radixArray, &numRadix, plan->max_radix); + + assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n"); + assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n"); + + unsigned int tmpLen = 1; + unsigned int i; + for(i = 0; i < numRadix; i++) + { + assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) ); + tmpLen *= radixArray[i]; + } + assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n"); + + int offset, midPad; + string localString(""), kernelName(""); + + clFFT_DataFormat dataFormat = plan->format; + string *kernelString = plan->kernel_string; + + + cl_fft_kernel_info **kInfo = &plan->kernel_info; + int kCount = 0; + + while(*kInfo) + { + kInfo = &(*kInfo)->next; + kCount++; + } + + kernelName = string("fft") + num2str(kCount); + + *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); + (*kInfo)->kernel = 0; + (*kInfo)->lmem_size = 0; + (*kInfo)->num_workgroups = 0; + (*kInfo)->num_workitems_per_workgroup = 0; + (*kInfo)->dir = cl_fft_kernel_x; + (*kInfo)->in_place_possible = 1; + (*kInfo)->next = NULL; + (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); + strcpy((*kInfo)->kernel_name, kernelName.c_str()); + + unsigned int numWorkItemsPerXForm = n / radixArray[0]; + unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; + assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup); + int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm; + (*kInfo)->num_workgroups = 1; + (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG; + (*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG; + + unsigned int *N = radixArray; + unsigned int maxRadix = N[0]; + unsigned int lMemSize = 0; + + insertVariables(localString, maxRadix); + + lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + + string xcomp = string("x"); + string ycomp = string("y"); + + unsigned int Nprev = 1; + unsigned int len = n; + unsigned int r; + for(r = 0; r < numRadix; r++) + { + int numIter = N[0] / N[r]; + int numWorkItemsReq = n / N[r]; + int Ncurr = Nprev * N[r]; + insertfftKernel(localString, N[r], numIter); + + if(r < (numRadix - 1)) { + insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm); + lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad); + insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad); + insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); + insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); + insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); + insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); + Nprev = Ncurr; + len = len / N[r]; + } + } + + lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + + insertHeader(*kernelString, kernelName, dataFormat); + *kernelString += string("{\n"); + if((*kInfo)->lmem_size) + *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); + *kernelString += localString; + *kernelString += string("}\n"); +} + +// For n larger than what can be computed using local memory fft, global transposes +// multiple kernel launces is needed. For these sizes, n can be decomposed using +// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel +// launches will be needed, first computing 64 x 32, length 128 ffts, second computing +// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts. +// Each of these base radices can futher be divided into factors so that each of these +// base ffts can be computed within one kernel launch using in-register ffts and local +// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length +// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length +// 16 ffts followed by transpose using local memory followed by each of these eight +// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This +// means only 8 work items are needed for computing one length 128 fft. If we choose +// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one +// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this +// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each +// work group where each work group is computing 8 length 128 ffts where each length +// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels +// in this example. Users can play with difference base radices and difference +// decompositions of base radices to generates different kernels and see which gives +// best performance. Following function is just fixed to use 128 as base radix + +void +getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) +{ + int baseRadix = min(n, 128); + + int numR = 0; + int N = n; + while(N > baseRadix) + { + N /= baseRadix; + numR++; + } + + for(int i = 0; i < numR; i++) + radix[i] = baseRadix; + + radix[numR] = N; + numR++; + *numRadices = numR; + + for(int i = 0; i < numR; i++) + { + int B = radix[i]; + if(B <= 8) + { + R1[i] = B; + R2[i] = 1; + continue; + } + + int r1 = 2; + int r2 = B / r1; + while(r2 > r1) + { + r1 *=2; + r2 = B / r1; + } + R1[i] = r1; + R2[i] = r2; + } +} + +static void +createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) +{ + int i, j, k, t; + int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int radix, R1, R2; + int numRadices; + + int maxThreadsPerBlock = plan->max_work_item_per_workgroup; + int maxArrayLen = plan->max_radix; + int batchSize = plan->min_mem_coalesce_width; + clFFT_DataFormat dataFormat = plan->format; + int vertical = (dir == cl_fft_kernel_x) ? 0 : 1; + + getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices); + + int numPasses = numRadices; + + string localString(""), kernelName(""); + string *kernelString = plan->kernel_string; + cl_fft_kernel_info **kInfo = &plan->kernel_info; + int kCount = 0; + + while(*kInfo) + { + kInfo = &(*kInfo)->next; + kCount++; + } + + int N = n; + int m = (int)log2(n); + int Rinit = vertical ? BS : 1; + batchSize = vertical ? min(BS, batchSize) : batchSize; + int passNum; + + for(passNum = 0; passNum < numPasses; passNum++) + { + + localString.clear(); + kernelName.clear(); + + radix = radixArr[passNum]; + R1 = R1Arr[passNum]; + R2 = R2Arr[passNum]; + + int strideI = Rinit; + for(i = 0; i < numPasses; i++) + if(i != passNum) + strideI *= radixArr[i]; + + int strideO = Rinit; + for(i = 0; i < passNum; i++) + strideO *= radixArr[i]; + + int threadsPerXForm = R2; + batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize; + batchSize = min(batchSize, strideI); + int threadsPerBlock = batchSize * threadsPerXForm; + threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock); + batchSize = threadsPerBlock / threadsPerXForm; + assert(R2 <= R1); + assert(R1*R2 == radix); + assert(R1 <= maxArrayLen); + assert(threadsPerBlock <= maxThreadsPerBlock); + + int numIter = R1 / R2; + int gInInc = threadsPerBlock / batchSize; + + + int lgStrideO = log2(strideO); + int numBlocksPerXForm = strideI / batchSize; + int numBlocks = numBlocksPerXForm; + if(!vertical) + numBlocks *= BS; + else + numBlocks *= vertBS; + + kernelName = string("fft") + num2str(kCount); + *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); + (*kInfo)->kernel = 0; + if(R2 == 1) + (*kInfo)->lmem_size = 0; + else + { + if(strideO == 1) + (*kInfo)->lmem_size = (radix + 1)*batchSize; + else + (*kInfo)->lmem_size = threadsPerBlock*R1; + } + (*kInfo)->num_workgroups = numBlocks; + (*kInfo)->num_xforms_per_workgroup = 1; + (*kInfo)->num_workitems_per_workgroup = threadsPerBlock; + (*kInfo)->dir = dir; + if( (passNum == (numPasses - 1)) && (numPasses & 1) ) + (*kInfo)->in_place_possible = 1; + else + (*kInfo)->in_place_possible = 0; + (*kInfo)->next = NULL; + (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); + strcpy((*kInfo)->kernel_name, kernelName.c_str()); + + insertVariables(localString, R1); + + if(vertical) + { + localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n"); + localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); + localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n"); + localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n"); + localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); + int stride = radix*Rinit; + for(i = 0; i < passNum; i++) + stride *= radixArr[i]; + localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n"); + localString += string("bNum = groupId;\n"); + } + else + { + int lgNumBlocksPerXForm = log2(numBlocksPerXForm); + localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); + localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n"); + localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n"); + localString += string("tid = indexIn;\n"); + localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); + int stride = radix*Rinit; + for(i = 0; i < passNum; i++) + stride *= radixArr[i]; + localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); + localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n"); + localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); + } + + // Load Data + int lgBatchSize = log2(batchSize); + localString += string("tid = lId;\n"); + localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n"); + localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); + localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n"); + + if(dataFormat == clFFT_SplitComplexFormat) + { + localString += string("in_real += indexIn;\n"); + localString += string("in_imag += indexIn;\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n"); + } + else + { + localString += string("in += indexIn;\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n"); + } + + localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); + + if(R2 > 1) + { + // twiddle + for(k = 1; k < R1; k++) + { + localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); + localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); + localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); + } + + // shuffle + numIter = R1 / R2; + localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n"); + localString += string("lMemStore = sMem + tid;\n"); + localString += string("lMemLoad = sMem + indexIn;\n"); + for(k = 0; k < R1; k++) + localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < numIter; k++) + for(t = 0; t < R2; t++) + localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < R1; k++) + localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < numIter; k++) + for(t = 0; t < R2; t++) + localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + for(j = 0; j < numIter; j++) + localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n"); + } + + // twiddle + if(passNum < (numPasses - 1)) + { + localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); + localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n"); + for(t = 0; t < R1; t++) + { + localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); + localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); + localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n"); + } + } + + // Store Data + if(strideO == 1) + { + + localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n"); + localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n"); + + for(int i = 0; i < R1/R2; i++) + for(int j = 0; j < R2; j++) + localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + if(threadsPerBlock >= radix) + { + for(int i = 0; i < R1; i++) + localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); + } + else + { + int innerIter = radix/threadsPerBlock; + int outerIter = R1/innerIter; + for(int i = 0; i < outerIter; i++) + for(int j = 0; j < innerIter; j++) + localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); + } + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + for(int i = 0; i < R1/R2; i++) + for(int j = 0; j < R2; j++) + localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + if(threadsPerBlock >= radix) + { + for(int i = 0; i < R1; i++) + localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); + } + else + { + int innerIter = radix/threadsPerBlock; + int outerIter = R1/innerIter; + for(int i = 0; i < outerIter; i++) + for(int j = 0; j < innerIter; j++) + localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); + } + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + localString += string("indexOut += tid;\n"); + if(dataFormat == clFFT_SplitComplexFormat) { + localString += string("out_real += indexOut;\n"); + localString += string("out_imag += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); + for(k = 0; k < R1; k++) + localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); + } + else { + localString += string("out += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n"); + } + + } + else + { + localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n"); + if(dataFormat == clFFT_SplitComplexFormat) { + localString += string("out_real += indexOut;\n"); + localString += string("out_imag += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n"); + for(k = 0; k < R1; k++) + localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n"); + } + else { + localString += string("out += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n"); + } + } + + insertHeader(*kernelString, kernelName, dataFormat); + *kernelString += string("{\n"); + if((*kInfo)->lmem_size) + *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); + *kernelString += localString; + *kernelString += string("}\n"); + + N /= radix; + kInfo = &(*kInfo)->next; + kCount++; + } +} + +void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) +{ + unsigned int radixArray[10]; + unsigned int numRadix; + + switch(dir) + { + case cl_fft_kernel_x: + if(plan->n.x > plan->max_localmem_fft_size) + { + createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); + } + else if(plan->n.x > 1) + { + getRadixArray(plan->n.x, radixArray, &numRadix, 0); + if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) + { + createLocalMemfftKernelString(plan); + } + else + { + getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix); + if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) + createLocalMemfftKernelString(plan); + else + createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); + } + } + break; + + case cl_fft_kernel_y: + if(plan->n.y > 1) + createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1); + break; + + case cl_fft_kernel_z: + if(plan->n.z > 1) + createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1); + default: + return; + } +} + diff --git a/deps/oclfft/fft_setup.cpp b/deps/oclfft/fft_setup.cpp new file mode 100644 index 0000000..eeecfa7 --- /dev/null +++ b/deps/oclfft/fft_setup.cpp @@ -0,0 +1,389 @@ + +// +// File: fft_setup.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include "fft_internal.h" +#include "fft_base_kernels.h" +#include <limits.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <iostream> +#include <string> +#include <sstream> + +using namespace std; + +extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems); + +static void +getBlockConfigAndKernelString(cl_fft_plan *plan) +{ + plan->temp_buffer_needed = 0; + *plan->kernel_string += baseKernels; + + if(plan->format == clFFT_SplitComplexFormat) + *plan->kernel_string += twistKernelPlannar; + else + *plan->kernel_string += twistKernelInterleaved; + + switch(plan->dim) + { + case clFFT_1D: + FFT1D(plan, cl_fft_kernel_x); + break; + + case clFFT_2D: + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + break; + + case clFFT_3D: + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + FFT1D(plan, cl_fft_kernel_z); + break; + + default: + return; + } + + plan->temp_buffer_needed = 0; + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + plan->temp_buffer_needed |= !kInfo->in_place_possible; + kInfo = kInfo->next; + } +} + + +static void +deleteKernelInfo(cl_fft_kernel_info *kInfo) +{ + if(kInfo) + { + if(kInfo->kernel_name) + free(kInfo->kernel_name); + if(kInfo->kernel) + clReleaseKernel(kInfo->kernel); + free(kInfo); + } +} + +static void +destroy_plan(cl_fft_plan *Plan) +{ + cl_fft_kernel_info *kernel_info = Plan->kernel_info; + + while(kernel_info) + { + cl_fft_kernel_info *tmp = kernel_info->next; + deleteKernelInfo(kernel_info); + kernel_info = tmp; + } + + Plan->kernel_info = NULL; + + if(Plan->kernel_string) + { + delete Plan->kernel_string; + Plan->kernel_string = NULL; + } + if(Plan->twist_kernel) + { + clReleaseKernel(Plan->twist_kernel); + Plan->twist_kernel = NULL; + } + if(Plan->program) + { + clReleaseProgram(Plan->program); + Plan->program = NULL; + } + if(Plan->tempmemobj) + { + clReleaseMemObject(Plan->tempmemobj); + Plan->tempmemobj = NULL; + } + if(Plan->tempmemobj_real) + { + clReleaseMemObject(Plan->tempmemobj_real); + Plan->tempmemobj_real = NULL; + } + if(Plan->tempmemobj_imag) + { + clReleaseMemObject(Plan->tempmemobj_imag); + Plan->tempmemobj_imag = NULL; + } +} + +static int +createKernelList(cl_fft_plan *plan) +{ + cl_program program = plan->program; + cl_fft_kernel_info *kernel_info = plan->kernel_info; + + cl_int err; + while(kernel_info) + { + kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err); + if(!kernel_info->kernel || err != CL_SUCCESS) + return err; + kernel_info = kernel_info->next; + } + + if(plan->format == clFFT_SplitComplexFormat) + plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err); + else + plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err); + + if(!plan->twist_kernel || err) + return err; + + return CL_SUCCESS; +} + +int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices) +{ + int reg_needed = 0; + *max_wg_size = INT_MAX; + int err; + size_t wg_size; + + unsigned int i; + for(i = 0; i < num_devices; i++) + { + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL); + if(err != CL_SUCCESS) + return -1; + + if(wg_size < kInfo->num_workitems_per_workgroup) + reg_needed |= 1; + + if(*max_wg_size > wg_size) + *max_wg_size = wg_size; + + kInfo = kInfo->next; + } + } + + return reg_needed; +} + +#define ERR_MACRO(err) { \ + if( err != CL_SUCCESS) \ + { \ + if(error_code) \ + *error_code = err; \ + clFFT_DestroyPlan((clFFT_Plan) plan); \ + return (clFFT_Plan) NULL; \ + } \ + } + +clFFT_Plan +clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ) +{ + int i; + cl_int err; + int isPow2 = 1; + cl_fft_plan *plan = NULL; + ostringstream kString; + int num_devices; + int gpu_found = 0; + cl_device_id devices[16]; + size_t ret_size; + cl_device_type device_type; + + if(!context) + ERR_MACRO(CL_INVALID_VALUE); + + isPow2 |= n.x && !( (n.x - 1) & n.x ); + isPow2 |= n.y && !( (n.y - 1) & n.y ); + isPow2 |= n.z && !( (n.z - 1) & n.z ); + + if(!isPow2) + ERR_MACRO(CL_INVALID_VALUE); + + if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) ) + ERR_MACRO(CL_INVALID_VALUE); + + plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan)); + if(!plan) + ERR_MACRO(CL_OUT_OF_RESOURCES); + + plan->context = context; + clRetainContext(context); + plan->n = n; + plan->dim = dim; + plan->format = dataFormat; + plan->kernel_info = 0; + plan->num_kernels = 0; + plan->twist_kernel = 0; + plan->program = 0; + plan->temp_buffer_needed = 0; + plan->last_batch_size = 0; + plan->tempmemobj = 0; + plan->tempmemobj_real = 0; + plan->tempmemobj_imag = 0; + plan->max_localmem_fft_size = 2048; + plan->max_work_item_per_workgroup = 256; + plan->max_radix = 16; + plan->min_mem_coalesce_width = 16; + plan->num_local_mem_banks = 16; + +patch_kernel_source: + + plan->kernel_string = new string(""); + if(!plan->kernel_string) + ERR_MACRO(CL_OUT_OF_RESOURCES); + + getBlockConfigAndKernelString(plan); + + const char *source_str = plan->kernel_string->c_str(); + plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err); + ERR_MACRO(err); + + err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size); + ERR_MACRO(err); + + num_devices = ret_size / sizeof(cl_device_id); + + err = clBuildProgram(plan->program, num_devices, devices, "-cl-mad-enable", NULL, NULL); + if (err != CL_SUCCESS) + { + char *build_log; + char devicename[200]; + size_t log_size; + + err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + ERR_MACRO(err); + + build_log = (char *) malloc(log_size + 1); + + err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); + ERR_MACRO(err); + + err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL); + ERR_MACRO(err); + + fprintf(stdout, "FFT program build log on device %s\n", devicename); + fprintf(stdout, "%s\n", build_log); + free(build_log); + + ERR_MACRO(err); + } + + err = createKernelList(plan); + ERR_MACRO(err); + + // we created program and kernels based on "some max work group size (default 256)" ... this work group size + // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source + // setting this as limit i.e max group size and rebuild. + unsigned int max_kernel_wg_size; + int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices); + if(patching_req == -1) + { + ERR_MACRO(err); + } + + if(patching_req) + { + destroy_plan(plan); + plan->max_work_item_per_workgroup = max_kernel_wg_size; + goto patch_kernel_source; + } + + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + plan->num_kernels++; + kInfo = kInfo->next; + } + + if(error_code) + *error_code = CL_SUCCESS; + + return (clFFT_Plan) plan; +} + +void +clFFT_DestroyPlan(clFFT_Plan plan) +{ + cl_fft_plan *Plan = (cl_fft_plan *) plan; + if(Plan) + { + destroy_plan(Plan); + clReleaseContext(Plan->context); + free(Plan); + } +} + +void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file) +{ + size_t gDim, lDim; + FILE *out; + if(!file) + out = stdout; + else + out = file; + + cl_fft_plan *plan = (cl_fft_plan *) Plan; + cl_fft_kernel_info *kInfo = plan->kernel_info; + + while(kInfo) + { + cl_int s = 1; + getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim); + fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim); + kInfo = kInfo->next; + } + fprintf(out, "%s\n", plan->kernel_string->c_str()); +} diff --git a/deps/oclfft/procs.h b/deps/oclfft/procs.h new file mode 100644 index 0000000..f6028c9 --- /dev/null +++ b/deps/oclfft/procs.h @@ -0,0 +1,53 @@ + +// +// File: procs.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#define test_start() +#define log_perf(_number, _higherBetter, _numType, _format, ...) printf("Performance Number " _format " (in %s, %s): %g\n",##__VA_ARGS__, _numType, _higherBetter?"higher is better":"lower is better" , _number) +#define log_info printf +#define log_error printf +#define test_finish() |