From 6e60b7802bb0369cc9dd8b1715073a1ff3c18f03 Mon Sep 17 00:00:00 2001
From: Edoardo Pasca <edo.paskino@gmail.com>
Date: Fri, 24 Jan 2020 13:59:09 +0000
Subject: axpby as concrete method in DataContainer and BlockDataContainer
 (#489)

* axpby as concrete method in DataContainer and BlockDataContainer

* fixed axpby and added unittest

* PDHG to use axpby

* pass num_threads to axpby

* void commit

* add seed to random in test

* NUM_THREADS can be imported from ccpi.utilities

* added test to axpby with num_threads
---
 .../Python/ccpi/framework/BlockDataContainer.py    |  47 ++++++-
 Wrappers/Python/ccpi/framework/framework.py        |  31 +++--
 .../Python/ccpi/optimisation/algorithms/PDHG.py    |  16 ++-
 .../optimisation/operators/GradientOperator.py     |   7 +-
 Wrappers/Python/ccpi/utilities/__init__.py         |  22 +++
 Wrappers/Python/test/test_BlockDataContainer.py    | 155 +++++++++++++++++++++
 Wrappers/Python/test/test_DataContainer.py         |  15 +-
 Wrappers/Python/test/test_Gradient.py              |   4 +-
 src/Core/CMakeLists.txt                            |   5 +-
 src/Core/FiniteDifferenceLibrary.c                 |  11 --
 src/Core/axpby.c                                   |  84 +----------
 src/Core/include/FiniteDifferenceLibrary.h         |   3 +-
 src/Core/include/axpby.h                           |  13 +-
 src/Core/include/utilities.h                       |   3 +
 src/Core/utilities.c                               |  14 ++
 15 files changed, 300 insertions(+), 130 deletions(-)
 create mode 100644 src/Core/include/utilities.h
 create mode 100644 src/Core/utilities.c

diff --git a/Wrappers/Python/ccpi/framework/BlockDataContainer.py b/Wrappers/Python/ccpi/framework/BlockDataContainer.py
index 22cee03..a0d139b 100644
--- a/Wrappers/Python/ccpi/framework/BlockDataContainer.py
+++ b/Wrappers/Python/ccpi/framework/BlockDataContainer.py
@@ -23,6 +23,7 @@ import numpy
 from numbers import Number
 import functools
 from ccpi.framework import DataContainer
+from ccpi.utilities import NUM_THREADS
 #from ccpi.framework import AcquisitionData, ImageData
 #from ccpi.optimisation.operators import Operator, LinearOperator
 
@@ -50,11 +51,12 @@ class BlockDataContainer(object):
     A * 3 = [ 3 * [B,C] , 3* D] = [ [ 3*B, 3*C]  , 3*D ]
     
     '''
-    ADD = 'add'
+    ADD      = 'add'
     SUBTRACT = 'subtract'
     MULTIPLY = 'multiply'
-    DIVIDE = 'divide'
-    POWER = 'power'
+    DIVIDE   = 'divide'
+    POWER    = 'power'
+    AXPBY    = 'axpby'
     __array_priority__ = 1
     __container_priority__ = 2
     def __init__(self, *args, **kwargs):
@@ -173,7 +175,22 @@ class BlockDataContainer(object):
             self.binary_operations(BlockDataContainer.DIVIDE, other, *args, **kwargs)
         else:
             return self.binary_operations(BlockDataContainer.DIVIDE, other, *args, **kwargs)
-    
+    def axpby(self, a, b, y, out, dtype=numpy.float32, num_threads = NUM_THREADS):
+        r'''performs axpby element-wise on the BlockDataContainer containers
+        
+        Does the operation .. math:: a*x+b*y and stores the result in out, where x is self
+
+        :param a: scalar
+        :param b: scalar
+        :param y: compatible (Block)DataContainer
+        :param out: (Block)DataContainer to store the result
+        :param dtype: optional, data type of the DataContainers
+        '''
+        if out is None:
+            raise ValueError("out container cannot be None")
+        kwargs = {'a':a, 'b':b, 'out':out, 'dtype': dtype, 'num_threads': NUM_THREADS}
+        self.binary_operations(BlockDataContainer.AXPBY, y, **kwargs)
+
 
     def binary_operations(self, operation, other, *args, **kwargs):
         '''Algebra: generic method of algebric operation with BlockDataContainer with number/DataContainer or BlockDataContainer
@@ -234,11 +251,19 @@ class BlockDataContainer(object):
                     op = el.divide
                 elif operation == BlockDataContainer.POWER:
                     op = el.power
+                elif operation == BlockDataContainer.AXPBY:
+                    if not isinstance(other, BlockDataContainer):
+                        raise ValueError("{} cannot handle {}".format(operation, type(other)))
+                    op = el.axpby
                 else:
                     raise ValueError('Unsupported operation', operation)
                 if out is not None:
                     kw['out'] = out.get_item(i)
-                    op(ot, *args, **kw)
+                    if operation == BlockDataContainer.AXPBY:
+                        kw['y'] = ot
+                        el.axpby(kw['a'], kw['b'], kw['y'], kw['out'], kw['dtype'], kw['num_threads'])
+                    else:
+                        op(ot, *args, **kw)
                 else:
                     res.append(op(ot, *args, **kw))
             if out is not None:
@@ -249,6 +274,12 @@ class BlockDataContainer(object):
         else:
             # try to do algebra with one DataContainer. Will raise error if not compatible
             kw = kwargs.copy()
+            if operation != BlockDataContainer.AXPBY:
+                # remove keyworded argument related to AXPBY
+                for k in ['a','b','y', 'num_threads', 'dtype']:
+                    if k in kw.keys():
+                        kw.pop(k)
+                
             res = []
             for i,el in enumerate(self.containers):
                 if operation == BlockDataContainer.ADD:
@@ -261,6 +292,12 @@ class BlockDataContainer(object):
                     op = el.divide
                 elif operation == BlockDataContainer.POWER:
                     op = el.power
+                elif operation == BlockDataContainer.AXPBY:
+                    # As out cannot be None, it is safe to continue the 
+                    # for loop after the call to axpby
+                    kw['out'] = out.get_item(i)
+                    el.axpby(kw['a'], kw['b'], other, kw['out'], kw['dtype'], kw['num_threads'])
+                    continue
                 else:
                     raise ValueError('Unsupported operation', operation)
                 if out is not None:
diff --git a/Wrappers/Python/ccpi/framework/framework.py b/Wrappers/Python/ccpi/framework/framework.py
index 65121d2..6f1ed1c 100644
--- a/Wrappers/Python/ccpi/framework/framework.py
+++ b/Wrappers/Python/ccpi/framework/framework.py
@@ -26,6 +26,7 @@ import warnings
 from functools import reduce
 from numbers import Number
 import ctypes, platform
+from ccpi.utilities import NUM_THREADS
 
 # dll = os.path.abspath(os.path.join( 
 #          os.path.abspath(os.path.dirname(__file__)),
@@ -45,6 +46,11 @@ else:
 #print ("dll location", dll)
 cilacc = ctypes.cdll.LoadLibrary(dll)
 
+#default nThreads
+# import multiprocessing
+# cpus = multiprocessing.cpu_count()
+# NUM_THREADS = max(int(cpus/2),1)
+
 
 def find_key(dic, val):
     """return the key of dictionary dic given the value"""
@@ -828,24 +834,27 @@ class DataContainer(object):
     def minimum(self,x2, out=None, *args, **kwargs):
         return self.pixel_wise_binary(numpy.minimum, x2=x2, out=out, *args, **kwargs)
 
-    @staticmethod
-    def axpby(a,x,b,y,out,dtype=numpy.float32):
+    def axpby(self, a, b, y, out, dtype=numpy.float32, num_threads=NUM_THREADS):
         '''performs axpby with cilacc C library
         
-        Does the operation .. math:: a*x+b*y and stores the result in out
+        Does the operation .. math:: a*x+b*y and stores the result in out, where x is self
 
         :param a: scalar
-        :param x: DataContainer
+        :type a: float
         :param b: scalar
+        :type b: float
         :param y: DataContainer
-        :param out: DataContainer to store the result
-        :param dtype: optional, data type of the DataContainers
+        :param out: DataContainer instance to store the result
+        :param dtype: data type of the DataContainers
+        :type dtype: numpy type, optional, default numpy.float32
+        :param num_threads: number of threads to run on
+        :type num_threads: int, optional, default 1/2 CPU of the system
         '''
 
         c_float_p = ctypes.POINTER(ctypes.c_float)
         c_double_p = ctypes.POINTER(ctypes.c_double)
         # get the reference to the data
-        ndx = x.as_array()
+        ndx = self.as_array()
         ndy = y.as_array()
         ndout = out.as_array()
 
@@ -879,15 +888,17 @@ class DataContainer(object):
                                   ctypes.POINTER(ctypes.c_float),  # pointer to the third array 
                                   ctypes.c_float,                  # type of A (float)
                                   ctypes.c_float,                  # type of B (float)
-                                  ctypes.c_long]                   # type of size of first array 
+                                  ctypes.c_long,                   # type of size of first array 
+                                  ctypes.c_int]                    # number of threads
         cilacc.daxpby.argtypes = [ctypes.POINTER(ctypes.c_double), # pointer to the first array 
                                   ctypes.POINTER(ctypes.c_double), # pointer to the second array 
                                   ctypes.POINTER(ctypes.c_double), # pointer to the third array 
                                   ctypes.c_double,                 # type of A (c_double)
                                   ctypes.c_double,                 # type of B (c_double)
-                                  ctypes.c_long]                   # type of size of first array 
+                                  ctypes.c_long,                   # type of size of first array 
+                                  ctypes.c_int]                    # number of threads
 
-        if f(x_p, y_p, out_p, a, b, ndx.size) != 0:
+        if f(x_p, y_p, out_p, a, b, ndx.size, num_threads) != 0:
             raise RuntimeError('axpby execution failed')
         
 
diff --git a/Wrappers/Python/ccpi/optimisation/algorithms/PDHG.py b/Wrappers/Python/ccpi/optimisation/algorithms/PDHG.py
index cc384e3..dcb9298 100644
--- a/Wrappers/Python/ccpi/optimisation/algorithms/PDHG.py
+++ b/Wrappers/Python/ccpi/optimisation/algorithms/PDHG.py
@@ -130,24 +130,26 @@ class PDHG(Algorithm):
 
         # Gradient ascent for the dual variable
         self.operator.direct(self.xbar, out=self.y_tmp)
-        self.y_tmp *= self.sigma
-        self.y_tmp += self.y_old
+        # self.y_tmp *= self.sigma
+        # self.y_tmp += self.y_old
+        self.y_tmp.axpby(self.sigma, 1 , self.y_old, self.y_tmp)
 
         # self.y = self.f.proximal_conjugate(self.y_old, self.sigma)
         self.f.proximal_conjugate(self.y_tmp, self.sigma, out=self.y)
         
         # Gradient descent for the primal variable
         self.operator.adjoint(self.y, out=self.x_tmp)
-        self.x_tmp *= -1*self.tau
-        self.x_tmp += self.x_old
+        # self.x_tmp *= -1*self.tau
+        # self.x_tmp += self.x_old
+        self.x_tmp.axpby(-self.tau, 1. , self.x_old, self.x_tmp)
 
         self.g.proximal(self.x_tmp, self.tau, out=self.x)
 
         # Update
         self.x.subtract(self.x_old, out=self.xbar)
-        self.xbar *= self.theta
-        self.xbar += self.x
-
+        # self.xbar *= self.theta
+        # self.xbar += self.x
+        self.xbar.axpby(self.theta, 1 , self.x, self.xbar)
         
         
     def update_objective(self):
diff --git a/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py b/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py
index a45c3d2..a5feca3 100644
--- a/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py
+++ b/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py
@@ -22,13 +22,14 @@ from __future__ import print_function
 from ccpi.optimisation.operators import Operator, LinearOperator, ScaledOperator
 from ccpi.optimisation.operators import FiniteDiff, SparseFiniteDiff
 from ccpi.framework import ImageData, ImageGeometry, BlockGeometry, BlockDataContainer
+from ccpi.utilities import NUM_THREADS
 import numpy 
 import warnings
 
 #default nThreads
-import multiprocessing
-cpus = multiprocessing.cpu_count()
-NUM_THREADS = max(int(cpus/2),1)
+# import multiprocessing
+# cpus = multiprocessing.cpu_count()
+# NUM_THREADS = max(int(cpus/2),1)
 
 NEUMANN = 'Neumann'
 PERIODIC = 'Periodic'
diff --git a/Wrappers/Python/ccpi/utilities/__init__.py b/Wrappers/Python/ccpi/utilities/__init__.py
index e69de29..79eaa98 100644
--- a/Wrappers/Python/ccpi/utilities/__init__.py
+++ b/Wrappers/Python/ccpi/utilities/__init__.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+#   This work is part of the Core Imaging Library developed by
+#   Visual Analytics and Imaging System Group of the Science Technology
+#   Facilities Council, STFC
+
+#   Copyright 2020 Edoardo Pasca
+
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+
+#       http://www.apache.org/licenses/LICENSE-2.0
+
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+#default nThreads
+
+import multiprocessing
+NUM_THREADS = max(int(multiprocessing.cpu_count()/2),1)
diff --git a/Wrappers/Python/test/test_BlockDataContainer.py b/Wrappers/Python/test/test_BlockDataContainer.py
index bc0e83a..a8e59b0 100755
--- a/Wrappers/Python/test/test_BlockDataContainer.py
+++ b/Wrappers/Python/test/test_BlockDataContainer.py
@@ -485,4 +485,159 @@ class TestBlockDataContainer(unittest.TestCase):
             print(err)
         self.assertTrue(res)    
 
+    def test_axpby(self):
+        # test axpby between BlockDataContainers
+        ig0 = ImageGeometry(2,3,4)
+        ig1 = ImageGeometry(2,3,5)
+        
+        data0 = ig0.allocate(-1)
+        data2 = ig0.allocate(1)
+
+        data1 = ig0.allocate(2)
+        data3 = ig0.allocate(3)
+        
+        cp0 = BlockDataContainer(data0,data2)
+        cp1 = BlockDataContainer(data1,data3)
+        
+        out = cp0 * 0. - 10
+
+        cp0.axpby(3,-2,cp1,out, num_threads=4)
+
+        # operation should be [  3 * -1 + (-2) * 2 , 3 * 1 + (-2) * 3 ] 
+        # output should be [ -7 , -3 ]
+        res0 = ig0.allocate(-7)
+        res2 = ig0.allocate(-3)
+        res = BlockDataContainer(res0, res2)
+
+        print ("res0", res0.as_array())
+        print ("res2", res2.as_array())
+
+        print ("###############################")
+        
+        print ("out_0", out.get_item(0).as_array())
+        print ("out_1", out.get_item(1).as_array())
+        self.assertBlockDataContainerEqual(out, res)
+
+    def test_axpby2(self):
+        # test axpby with BlockDataContainer and DataContainer
+        ig0 = ImageGeometry(2,3,4)
+        # ig1 = ImageGeometry(2,3,5)
+        
+        data0 = ig0.allocate(-1)
+        data2 = ig0.allocate(1)
+
+        data1 = ig0.allocate(2)
+        # data3 = ig1.allocate(3)
+        
+        cp0 = BlockDataContainer(data0,data2)
+        # cp1 = BlockDataContainer(data1,data3)
+        
+        out = cp0 * 0. - 10
+
+        cp0.axpby(3,-2,data1,out)
+
+        # operation should be [  3 * -1 + (-2) * 2 , 3 * 1 + (-2) * 2 ] 
+        # output should be [ -7 , -1 ]
+        res0 = ig0.allocate(-7)
+        res2 = ig0.allocate(-1)
+        res = BlockDataContainer(res0, res2)
+
+        print ("res0", res0.as_array())
+        print ("res2", res2.as_array())
+
+        print ("###############################")
+        
+        print ("out_0", out.get_item(0).as_array())
+        print ("out_1", out.get_item(1).as_array())
+        self.assertBlockDataContainerEqual(out, res)
+
+
+    def test_axpby3(self):
+        # test axpby with nested BlockDataContainer
+        ig0 = ImageGeometry(2,3,4)
+        ig1 = ImageGeometry(2,3,5)
+        
+        data0 = ig0.allocate(-1)
+        data2 = ig0.allocate(1)
+
+        # data1 = ig0.allocate(2)
+        data3 = ig1.allocate(3)
+        
+        cp0 = BlockDataContainer(data0,data2)
+        cp1 = BlockDataContainer(cp0 *0. +  [2, -2], data3)
+        print (cp1.get_item(0).get_item(0).as_array())
+        print (cp1.get_item(0).get_item(1).as_array())
+        print (cp1.get_item(1).as_array())
+        print ("###############################")
+        
+
+
+        out = cp1 * 0. 
+        cp2 = out + [1,3]
+
 
+        print (cp2.get_item(0).get_item(0).as_array())
+        print (cp2.get_item(0).get_item(1).as_array())
+        print (cp2.get_item(1).as_array())
+
+        cp2.axpby(3,-2, cp1 ,out)
+
+        # output should be [ [ -1 , 7 ] , 3]
+        res0 = ig0.allocate(-1)
+        res2 = ig0.allocate(7)
+        res3 = ig1.allocate(3)
+        res = BlockDataContainer(BlockDataContainer(res0, res2), res3)
+
+        # print ("res0", res0.as_array())
+        # print ("res2", res2.as_array())
+
+        print ("###############################")
+        
+        # print ("out_0", out.get_item(0).as_array())
+        # print ("out_1", out.get_item(1).as_array())
+        self.assertBlockDataContainerEqual(out, res)
+
+    def test_axpby4(self):
+        # test axpby with nested BlockDataContainer
+        ig0 = ImageGeometry(2,3,4)
+        ig1 = ImageGeometry(2,3,5)
+        
+        data0 = ig0.allocate(-1)
+        data2 = ig0.allocate(1)
+
+        # data1 = ig0.allocate(2)
+        data3 = ig1.allocate(3)
+        
+        cp0 = BlockDataContainer(data0,data2)
+        cp1 = BlockDataContainer(cp0 *0. +  [2, -2], data3)
+        print (cp1.get_item(0).get_item(0).as_array())
+        print (cp1.get_item(0).get_item(1).as_array())
+        print (cp1.get_item(1).as_array())
+        print ("###############################")
+        
+
+
+        out = cp1 * 0. 
+        cp2 = out + [1,3]
+
+
+        print (cp2.get_item(0).get_item(0).as_array())
+        print (cp2.get_item(0).get_item(1).as_array())
+        print (cp2.get_item(1).as_array())
+
+        cp2.axpby(3,-2, cp1 ,out, num_threads=4)
+
+        # output should be [ [ -1 , 7 ] , 3]
+        res0 = ig0.allocate(-1)
+        res2 = ig0.allocate(7)
+        res3 = ig1.allocate(3)
+        res = BlockDataContainer(BlockDataContainer(res0, res2), res3)
+
+        # print ("res0", res0.as_array())
+        # print ("res2", res2.as_array())
+
+        print ("###############################")
+        
+        # print ("out_0", out.get_item(0).as_array())
+        # print ("out_1", out.get_item(1).as_array())
+        self.assertBlockDataContainerEqual(out, res)
diff --git a/Wrappers/Python/test/test_DataContainer.py b/Wrappers/Python/test/test_DataContainer.py
index 6e297ee..4a8a6d1 100755
--- a/Wrappers/Python/test/test_DataContainer.py
+++ b/Wrappers/Python/test/test_DataContainer.py
@@ -740,7 +740,20 @@ class TestDataContainer(unittest.TestCase):
         d2 = ig.allocate(2)                                                     
         out = ig.allocate(None)                                                 
         # equals to 2 * [1] + 1 * [2] = [4]
-        DataContainer.axpby(2,d1,1,d2,out)
+        d1.axpby(2,1,d2,out)
+        res = numpy.ones_like(d1.as_array()) * 4.
+        numpy.testing.assert_array_equal(res, out.as_array())
+    def test_axpby2(self):
+        print ("test axpby2")
+        N = 100
+        ig = ImageGeometry(N,2*N,N*10)                                               
+        d1 = ig.allocate(1)                                                     
+        d2 = ig.allocate(2)                                                     
+        out = ig.allocate(None)   
+        print ("allocated")                                              
+        # equals to 2 * [1] + 1 * [2] = [4]
+        d1.axpby(2,1,d2,out, num_threads=4)
+        print ("calculated") 
         res = numpy.ones_like(d1.as_array()) * 4.
         numpy.testing.assert_array_equal(res, out.as_array())
 
diff --git a/Wrappers/Python/test/test_Gradient.py b/Wrappers/Python/test/test_Gradient.py
index 5aeede0..78fc261 100755
--- a/Wrappers/Python/test/test_Gradient.py
+++ b/Wrappers/Python/test/test_Gradient.py
@@ -30,6 +30,8 @@ class TestGradient(unittest.TestCase):
         N, M, K = 20, 30, 40
         channels = 10
         
+        numpy.random.seed(1)
+        
         # check range geometry, examples
         
         ig1 = ImageGeometry(voxel_num_x = M, voxel_num_y = N) 
@@ -235,4 +237,4 @@ class TestGradient(unittest.TestCase):
 
         grad = Gradient(ig, bnd_cond='Periodic', correlation='SpaceChannels', backend='numpy')
         self.assertTrue(LinearOperator.dot_test(grad))
-        
\ No newline at end of file
+        
diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
index e828fe5..9c9a89d 100644
--- a/src/Core/CMakeLists.txt
+++ b/src/Core/CMakeLists.txt
@@ -93,8 +93,9 @@ message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}")
 
 
 
-add_library(cilacc SHARED ${CMAKE_CURRENT_SOURCE_DIR}/axpby.c
-                          ${CMAKE_CURRENT_SOURCE_DIR}/FiniteDifferenceLibrary.c )
+add_library(cilacc SHARED ${CMAKE_CURRENT_SOURCE_DIR}/utilities.c
+                          ${CMAKE_CURRENT_SOURCE_DIR}/FiniteDifferenceLibrary.c
+                          ${CMAKE_CURRENT_SOURCE_DIR}/axpby.c )
 
 target_link_libraries(cilacc ${OpenMP_C_LIB_NAMES} )
 include_directories(cilacc PUBLIC 
diff --git a/src/Core/FiniteDifferenceLibrary.c b/src/Core/FiniteDifferenceLibrary.c
index fbf2646..244e170 100644
--- a/src/Core/FiniteDifferenceLibrary.c
+++ b/src/Core/FiniteDifferenceLibrary.c
@@ -16,17 +16,6 @@ DLL_EXPORT int openMPtest(int nThreads)
 	}
 	return nThreads_running;
 }
-void threads_setup(int nThreads_requested, int *nThreads_current)
-{
-#pragma omp parallel
-	{
-		if (omp_get_thread_num() == 0)
-		{
-			*nThreads_current = omp_get_num_threads();
-		}
-	}
-	omp_set_num_threads(nThreads_requested);
-}
 
 int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
 {
diff --git a/src/Core/axpby.c b/src/Core/axpby.c
index c4d162d..54a597f 100755
--- a/src/Core/axpby.c
+++ b/src/Core/axpby.c
@@ -1,87 +1,12 @@
 #include "axpby.h"
 
 
-DLL_EXPORT int padd(float * x, float * y, float * out, long size){
+DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size, int nThreads){
     long i = 0;
-#pragma omp parallel for
-    for (i=0; i < size; i++)
-    {
-        *(out + i ) = *(x + i) + *(y+i); 
-    }
-    return 0;
-}
-
-DLL_EXPORT int psubtract(float * x, float * y, float * out, long size){
-    long i = 0;
-#pragma omp parallel
-{
-//#pragma omp single
-//{
-//		printf("current number of threads %d\n", omp_get_num_threads());
-//}
-#pragma omp for
-    for (i=0; i < size; i++)
-    {
-        *(out + i ) = *(x + i) - *(y+i); 
-    }
-}
-    return 0;
-    
-}
-
-DLL_EXPORT int pmultiply(float * x, float * y, float * out, long size){
-    long i = 0;
-#pragma omp parallel for
-    for (i=0; i < size; i++)
-    {
-        *(out + i ) = *(x + i) * *(y+i); 
-    }
-    return 0;
-}
-
-DLL_EXPORT int pdivide(float * x, float * y, float * out, long size, float default_value)
-{
-    long i = 0;
-#pragma omp parallel for
-    for (i=0; i < size; i++)
-    {
-        *(out + i ) = *(y+i) ? *(x + i) / *(y+i) : default_value;
-    }
-    return 0;
-}
-DLL_EXPORT int ppower(float * x, float * y, float * out, long size){
-    long i = 0;
-#pragma omp parallel for
-    for (i=0; i < size; i++)
-    {
-        *(out + i ) = (float)pow(*(x + i) , *(y+i)) ; 
-    }
-    return 0;
-}
-
-DLL_EXPORT int pminimum(float * x, float * y, float * out, long size){
-    long i = 0;
-#pragma omp parallel for
-    for (i=0; i < size; i++)
-    {
-        *(out + i ) = *(y+i) > (*x+i) ? *(x + i) :  *(y+i); 
-    }
-    return 0;
-}
-
-DLL_EXPORT int pmaximum(float * x, float * y, float * out, long size) {
-	long i = 0;
-#pragma omp parallel for
-	for (i = 0; i < size; i++)
-	{
-		*(out + i) = *(y + i) < (*x + i) ? *(x + i) : *(y + i);
-	}
-	return 0;
-}
 
+    int nThreads_initial;
+	threads_setup(nThreads, &nThreads_initial);
 
-DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size){
-    long i = 0;
 #pragma omp parallel
 {
 #pragma omp for
@@ -90,11 +15,12 @@ DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long
         *(out + i ) = a * ( *(x + i) ) + b * ( *(y + i) ); 
     }
 }
+    omp_set_num_threads(nThreads_initial);
     return 0;
     
 }
 
-DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size) {
+DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size, int nThreads) {
 	long i = 0;
 #pragma omp parallel
 	{
diff --git a/src/Core/include/FiniteDifferenceLibrary.h b/src/Core/include/FiniteDifferenceLibrary.h
index 6e426af..b8e6c4f 100644
--- a/src/Core/include/FiniteDifferenceLibrary.h
+++ b/src/Core/include/FiniteDifferenceLibrary.h
@@ -3,4 +3,5 @@
 #include <stdio.h>
 #include "omp.h"
 //#include "ipp.h"
-#include "dll_export.h"
\ No newline at end of file
+#include "dll_export.h"
+#include "utilities.h"
\ No newline at end of file
diff --git a/src/Core/include/axpby.h b/src/Core/include/axpby.h
index 2849547..e13d6e1 100644
--- a/src/Core/include/axpby.h
+++ b/src/Core/include/axpby.h
@@ -3,15 +3,8 @@
 #include <stdio.h>
 #include "omp.h"
 #include "dll_export.h"
+#include "utilities.h"
 
 
-DLL_EXPORT int padd(float * x, float * y, float * out, long size);
-DLL_EXPORT int psubtract(float * x, float * y, float * out, long size);
-DLL_EXPORT int pmultiply(float * x, float * y, float * out, long size);
-DLL_EXPORT int pdivide(float * x, float * y, float * out, long size, float default_value);
-DLL_EXPORT int ppower(float * x, float * y, float * out, long size);
-DLL_EXPORT int pminimum(float * x, float * y, float * out, long size);
-DLL_EXPORT int pmaximum(float * x, float * y, float * out, long size);
-
-DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size);
-DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size);
+DLL_EXPORT int saxpby(float * x, float * y, float * out, float a, float b, long size, int nThreads);
+DLL_EXPORT int daxpby(double * x, double * y, double * out, double a, double b, long size, int nThreads);
diff --git a/src/Core/include/utilities.h b/src/Core/include/utilities.h
new file mode 100644
index 0000000..c3003d6
--- /dev/null
+++ b/src/Core/include/utilities.h
@@ -0,0 +1,3 @@
+#include "omp.h"
+
+void threads_setup(int nThreads_requested, int *nThreads_current);
\ No newline at end of file
diff --git a/src/Core/utilities.c b/src/Core/utilities.c
new file mode 100644
index 0000000..86b23e8
--- /dev/null
+++ b/src/Core/utilities.c
@@ -0,0 +1,14 @@
+#include "utilities.h"
+
+
+void threads_setup(int nThreads_requested, int *nThreads_current)
+{
+#pragma omp parallel
+	{
+		if (omp_get_thread_num() == 0)
+		{
+			*nThreads_current = omp_get_num_threads();
+		}
+	}
+	omp_set_num_threads(nThreads_requested);
+}
-- 
cgit v1.2.1