[vspline] 23/72: added code for evaluation of mesh grids to remap.h

Kay F. Jahnke kfj-guest at moszumanska.debian.org
Sun Jul 2 09:02:39 UTC 2017


This is an automated email from the git hooks/post-receive script.

kfj-guest pushed a commit to branch master
in repository vspline.

commit 1e263420474aed458919fce03d627393f9565910
Author: Kay F. Jahnke <kfjahnke at gmail.com>
Date:   Sun Dec 25 12:22:31 2016 +0100

    added code for evaluation of mesh grids to remap.h
---
 eval.h        | 152 ++++++++++++++++++++++++++++++--
 remap.h       | 273 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 thread_pool.h |  12 +--
 3 files changed, 426 insertions(+), 11 deletions(-)

diff --git a/eval.h b/eval.h
index 317d7c1..5d5b044 100644
--- a/eval.h
+++ b/eval.h
@@ -604,6 +604,16 @@ private:
 
 public:
 
+  const int & get_order() const
+  {
+    return ORDER ;
+  }
+
+  const shape_type & get_stride() const
+  {
+    return coefficients.stride() ;
+  }
+
   /// this constructor is the most flexible variant and will ultimately be called by all other
   /// constructor overloads. class evaluator is coded to be (thread)safe as a functor: all state
   /// (in terms of member data) is fixed at construction time and remains constant afterwards.
@@ -749,6 +759,16 @@ public:
       (*(fweight[axis])) ( weight.data() + axis * ORDER , *ci ) ;
   }
 
+  /// obtain weight for a single axis
+
+  template < typename rc_type , typename weight_type >
+  void obtain_weights ( weight_type * p_weight ,
+                        const int & axis ,
+                        const rc_type& c ) const
+  {
+    (*(fweight[axis])) ( p_weight , c ) ;
+  }
+
   /// _eval is the workhorse routine and implements the recursive arithmetic needed to
   /// evaluate the spline. First the weights for the current dimension are obtained
   /// from the weights object passed in. Once the weights are known, they are successively
@@ -817,11 +837,59 @@ public:
     }
   } ;
 
+  /// _xeval takes an ele_type** 'weight', pointing to 'level' ele_type*, each of
+  /// which points to ORDER weights. This variant is used when the per-axis
+  /// components of the weights appear repeatedly, like in mesh grid processing.
+  // TODO it has to be seen if this is actually faster than just copying together
+  // the weights into the 2D MultiArrayView used by the routines above
+  // ... This is hard to vectorize and introduces lots of new code, so I'm not
+  // using it, but it's left in for now
+
+  // tentatively I coded:
+  
+//   template < int level , class dtype >
+//   struct _xeval
+//   {
+//     dtype operator() ( const dtype* & pdata ,
+//                        offset_iterator & ofs ,
+//                        const ele_type** const weight ,
+//                        const int & ORDER
+//                      ) const
+//     {
+//       dtype result = dtype() ;
+//       for ( int i = 0 ; i < ORDER ; i++ )
+//       {
+//         result +=   weight [ level ] [ i ]
+//                   * _eval < level - 1 , dtype >() ( pdata , ofs , weight ) ;
+//       }
+//       return result ;
+//     }
+//   } ;
+// 
+//   template < class dtype >
+//   struct _xeval < 0 , dtype >
+//   {
+//     dtype operator() ( const dtype* & pdata ,
+//                        offset_iterator & ofs ,
+//                        const ele_type** const weight ,
+//                        const int & ORDER
+//                      ) const
+//     {
+//       dtype result = dtype() ;
+//       for ( int i = 0 ; i < ORDER ; i++ )
+//       {
+//         result += pdata [ *ofs ] * weight [ 0 ] [ i ] ;
+//         ++ofs ;
+//       }
+//       return result ;
+//     }
+//   } ;
+
   // next are evaluation routines. there are quite a few, since I've coded for operation
   // from different starting points and both for vectorized and nonvectorized operation.
   
   /// evaluation variant which takes an offset and a set of weights. This is the final delegate,
-  /// calling the recursive _eval method. Having the weights passed in via a const MultiArrayViev &
+  /// calling the recursive _eval method. Having the weights passed in via a const MultiArrayView &
   /// allows calling code to provide their own weights, together with their shape, in one handy
   /// packet. And in the normal sequence of delegation inside class eval, the next routine 'up'
   /// can also package the weights nicely in a MultiArrayView. Note that select is now an ic_type,
@@ -836,6 +904,19 @@ public:
     result = _eval<level,value_type>() ( base , ofs , weight ) ;
   }
 
+//   /// x... variant, taking component weights by address.
+//   /// this variant is used for mesh grid processing
+//   
+//   void xeval ( const ic_type & select ,
+//                const ele_type** const weight ,
+//                const int & ORDER ,
+//                value_type & result ) const
+//   {
+//     const value_type * base = coefficients.data() + select ;
+//     offset_iterator ofs = offsets.begin() ;
+//     result = _xeval<level,value_type>() ( base , ofs , weight , ORDER ) ;
+//   }
+
   /// 'penultimate' delegate, taking a multidimensional index 'select' to the beginning of
   /// the coefficient window to process. Here the multidimensional index is translated
   /// into an offset from the coefficient array's base adress. Carrying the information as
@@ -1092,6 +1173,55 @@ public:
     }  
   } ;
 
+//   template < class dtype , int level >
+//   struct _v_xeval
+//   {
+//     dtype operator() ( const component_base_type& base , ///< base adresses of components
+//                        const ic_v& origin ,              ///< offsets to evaluation window origins
+//                        offset_iterator & ofs ,           ///< offsets to coefficients inside this window
+//                        const ele_v ** const weight ,
+//                        const int & ORDER ) const
+//     {
+//       dtype sum = dtype() ;    ///< to accumulate the result
+//       dtype subsum ; ///< to pick up the result of the recursive call
+// 
+//       for ( int i = 0 ; i < ORDER ; i++ )
+//       {
+//         subsum = _v_xeval < dtype , level - 1 >() ( base , origin , ofs , weight , ORDER );
+//         for ( int ch = 0 ; ch < channels ; ch++ )
+//         {
+//           sum[ch] += weight [ level ] [ i ] * subsum[ch] ;
+//         }
+//       }
+//       return sum ;
+//     }  
+//   } ;
+// 
+//   /// the level 0 routine terminates the recursion
+//   
+//   template < class dtype >
+//   struct _v_xeval < dtype , 0 >
+//   {
+//     dtype operator() ( const component_base_type& base , ///< base adresses of components
+//                        const ic_v& origin ,              ///< offsets to evaluation window origins
+//                        offset_iterator & ofs ,           ///< offsets to coefficients in this window
+//                        const ele_v ** const weight ,
+//                        const int & ORDER ) const         ///< weights to apply
+//     {
+//       dtype sum = dtype() ;
+// 
+//       for ( int i = 0 ; i < ORDER ; i++ )
+//       {
+//         for ( int ch = 0 ; ch < channels ; ch++ )
+//         {
+//           sum[ch] += weight [ 0 ] [ i ] * ele_v ( base[ch] , origin + *ofs ) ;
+//         }
+//         ++ofs ;
+//       }
+//       return sum ;
+//     }  
+//   } ;
+
   // vectorized variants of the evaluation routines:
   
   /// this is the vectorized version of the final delegate, calling _v_eval. The first
@@ -1114,9 +1244,21 @@ public:
     result = _v_eval < mc_ele_v , level >() ( component_base , select , ofs , weight ) ;
   }
 
-  /// in the penultimate delegate, we calculate the weights. We also use this
-  /// method 'further down' when we operate on 'compact split coordinates',
-  /// which contain offsets and fractional parts.
+//   /// x... variant, taking component weights by address
+//   /// used for mesh grid processing
+//   
+//   void v_xeval ( const ic_v& select ,
+//                  const ele_v ** const weight ,
+//                  const int & ORDER ,
+//                  mc_ele_v & result ) const
+//   {
+//     offset_iterator ofs = component_offsets.begin() ;
+//     result = _v_xeval < mc_ele_v , level >() ( component_base , select , ofs , weight , ORDER ) ;
+//   }
+// 
+//   /// in the penultimate delegate, we calculate the weights. We also use this
+//   /// method 'further down' when we operate on 'compact split coordinates',
+//   /// which contain offsets and fractional parts.
 
   void v_eval ( const ic_v& select ,       // offsets to coefficient windows
                 const nd_rc_v& tune ,      // fractional parts of the coordinates
@@ -1191,7 +1333,7 @@ public:
   /// data are strided. Anyway, it doesn't hurt to have the option.
   
   void v_eval ( const split_coordinate_type * const psc , // pointer to vsize split coordinates
-                value_type * result )  const             // pointer to vsize result values
+                value_type * result )  const              // pointer to vsize result values
   {
     nd_ic_v select ;
     nd_rc_v tune ;
diff --git a/remap.h b/remap.h
index 2f04dbe..2cf7565 100644
--- a/remap.h
+++ b/remap.h
@@ -1202,6 +1202,279 @@ void make_warp_array ( transformation < rc_type ,
   
 }
 
+// in grid_weight, for every dimension we have a set of ORDER weights
+// for every position in this dimension. in grid_ofs, we have the
+// partial offset for this dimension for every position. these partial
+// offsets are the product of the index for this dimension at the position
+// and the stride for this dimension, so that the sum of the partial
+// offsets for all dimensions yields the offset into the coefficient array
+// to the window of coefficients where the weights are to be applied.
+
+template < typename interpolator_type , // type offering eval()
+           typename target_type ,       // iterates over target array
+           typename weight_type ,       // singular weight data type
+           int level >                  // current axis
+struct _grid_eval
+{
+  void operator() ( int initial_ofs ,
+                    MultiArrayView < 2 , weight_type > & weight ,
+                    weight_type** const & grid_weight ,
+                    const int & ORDER ,
+                    int ** const & grid_ofs ,
+                    const interpolator_type & itp ,
+                    target_type & result )
+  {
+    for ( int ofs = 0 ; ofs < result.shape ( level ) ; ofs++ )
+    {
+      for ( int e = 0 ; e < ORDER ; e++ )
+        weight [ vigra::Shape2 ( e , level ) ] = grid_weight [ level ] [ ORDER * ofs + e ] ;
+      int cum_ofs = initial_ofs + grid_ofs [ level ] [ ofs ] ;
+      auto region = result.bindAt ( level , ofs ) ;
+      _grid_eval < interpolator_type , decltype ( region ) , weight_type , level-1 >()
+        ( cum_ofs , weight , grid_weight , ORDER , grid_ofs , itp , region ) ;
+    }
+  }
+} ;
+
+template < typename interpolator_type ,
+           typename target_type ,
+           typename weight_type >
+struct _grid_eval < interpolator_type ,
+                    target_type ,
+                    weight_type ,
+                    0 >
+{
+  void operator() ( int initial_ofs ,
+                    MultiArrayView < 2 , weight_type > & weight ,
+                    weight_type** const & grid_weight ,
+                    const int & ORDER ,
+                    int ** const & grid_ofs ,
+                    const interpolator_type & itp ,
+                    target_type & region )
+  {
+    auto iter = region.begin() ;    
+    int ofs_start = 0 ;
+    
+#ifdef USE_VC
+  
+    // on my system, using clang++, the vectorized code is slightly slower
+    // than the unvectorized code. With g++, the vectorized code is faster
+    // than either clang version, but the unvectorized code is much slower.
+
+    const int vsize = interpolator_type::vsize ;
+    const int channels = interpolator_type::channels ;
+    typedef typename interpolator_type::value_type value_type ;
+    typedef typename interpolator_type::ele_type ele_type ;
+    typedef typename interpolator_type::ic_v ic_v ;
+    typedef typename interpolator_type::ele_v ele_v ;
+    typedef typename interpolator_type::mc_ele_v mc_ele_v ;
+
+    // number of vectorized results
+    int aggregates = region.size() / vsize ;
+    // vectorized weights
+    MultiArray < 2 , ele_v > vweight ( weight.shape() ) ;
+    // vectorized offset
+    ic_v select ;
+    // buffer for target data
+    mc_ele_v vtarget ;
+
+    // initialize the vectorized weights for dimensions > 0
+    for ( int d = 1 ; d < weight.shape(1) ; d++ )
+    {
+      for ( int o = 0 ; o < ORDER ; o++ )
+        vweight [ vigra::Shape2 ( o , d ) ] = weight [ vigra::Shape2 ( o , d ) ] ;
+    }
+
+    // get a pointer to the target array's data (seen as elementary type)
+    ele_type * p_target = (ele_type*) ( region.data() ) ;
+    // and the stride, if any, also in terms of the elementary type, from
+    // one cluster of target data to the next
+    int stride = vsize * channels * region.stride(0) ;
+
+    for ( int a = 0 ; a < aggregates ; a++ )
+    {
+      // gather the individual weights into the vectorized form
+      for ( int o = 0 ; o < ORDER ; o++ )
+      {
+        vweight[ vigra::Shape2 ( o , 0 ) ].gather
+          ( grid_weight [ 0 ] + ORDER * a * vsize ,
+            ORDER * ic_v::IndexesFromZero() + o ) ;
+      }
+      select.load ( grid_ofs [ 0 ] + a * vsize ) ; // get the offsets from grid_ofs
+      select += initial_ofs ; // add cumulated offsets from higher dimensions
+      select *= channels ;    // offsets are in terms of value_type, expand
+      
+      // now we can call the vectorized eval routine
+      itp.v_eval ( select , vweight , vtarget ) ;
+      
+      // finally we scatter the vectorized result to target memory
+      for ( int ch = 0 ; ch < channels ; ch++ )
+        vtarget[ch].scatter ( p_target + ch ,
+                              channels * ic_v::IndexesFromZero() ) ;
+
+      // and set p_target to the next cluster of target values
+      p_target += stride ;
+    }
+    
+    // adapt the iterator into target array
+    iter += aggregates * vsize ;
+    // and the initial offset
+    ofs_start += aggregates * vsize ;
+
+#endif
+    
+    // if Vc wasn't used, we start with ofs = 0 and this loop
+    // does all the processing:
+    for ( int ofs = ofs_start ; ofs < region.shape ( 0 ) ; ofs++ )
+    {
+      for ( int e = 0 ; e < ORDER ; e++ )
+        weight [ vigra::Shape2 ( e , 0 )  ] = grid_weight [ 0 ] [ ORDER * ofs + e ] ;
+      int cum_ofs = initial_ofs + grid_ofs [ 0 ] [ ofs ] ;
+      // TODO: here's the place to vectorize: gather weights into ORDER vectors
+      // of weights, have a target buffer, call v_eval
+      itp.eval ( cum_ofs , weight , *iter ) ;
+      ++iter ;
+    }
+  }
+} ;
+  
+// Here is the single-threaded code for the grid_eval function.
+// The first argument is a shape range, defining the subsets of data
+// to process in a single thread. the remainder are forwards of the
+// arguments to grid_eval, partly as pointers. The call is effected
+// via 'multithread()' which sets up the partitioning and distribution
+// to threads from a thread pool.
+
+template < typename interpolator_type , // type offering eval()
+           typename target_type ,       // type of target MultiArrayView
+           typename weight_type ,       // singular weight data type
+           typename rc_type >           // singular real coordinate
+void st_grid_eval ( shape_range_type < target_type::actual_dimension > range ,
+                    rc_type ** const _grid_coordinate ,
+                    const interpolator_type * itp ,
+                    target_type * p_result )
+{
+  const int ORDER = itp->get_order() ;
+  const int dim_target = target_type::actual_dimension ;
+  
+  // pick the subarray of the 'whole' target array pertaining to this thread's range
+  auto result = p_result->subarray ( range[0] , range[1] ) ;
+  
+  // pick the subset of coordinates pertaining to this thread's range
+  const rc_type * grid_coordinate [ dim_target ] ;
+  for ( int d = 0 ; d < dim_target ; d++ )
+    grid_coordinate[d] = _grid_coordinate[d] + range[0][d] ;
+
+  // set up storage for precalculated weights and offsets
+
+  weight_type * grid_weight [ dim_target ] ;
+  int * grid_ofs [ dim_target ] ;
+  
+  // get some metrics
+  TinyVector < int , dim_target > shape ( result.shape() ) ;
+  TinyVector < int , dim_target > stride ( itp->get_stride() ) ;
+  
+  // allocate space for the per-axis weights and offsets
+  for ( int d = 0 ; d < dim_target ; d++ )
+  {
+    grid_weight[d] = new weight_type [ ORDER * shape [ d ] ] ;
+    grid_ofs[d] = new int [ shape [ d ] ] ;
+  }
+  
+  int select ;
+  rc_type tune ;
+  
+  // obtain the evaluator's mapping
+  auto mmap = itp->get_mapping() ;
+  
+  // fill in the weights and offsets, using the interpolator's mapping to split
+  // the coordinates received in grid_coordinate, the interpolator's obtain_weights
+  // method to produce the weight components, and the strides of the coefficient array
+  // to convert the integral parts of the coordinates into offsets.
+  for ( int d = 0 ; d < dim_target ; d++ )
+  {
+    for ( int c = 0 ; c < shape [ d ] ; c++ )
+    {
+      mmap ( grid_coordinate [ d ] [ c ] , select , tune , d ) ; 
+      itp->obtain_weights ( grid_weight [ d ] + ORDER * c , d , tune ) ;
+      grid_ofs [ d ] [ c ] = select * stride [ d ] ;
+    }
+  }
+  
+  // allocate storage for a set of singular weights
+  MultiArray < 2 , weight_type > weight ( vigra::Shape2 ( ORDER , dim_target ) ) ;
+  
+  // now call the recursive workhorse routine
+  _grid_eval < interpolator_type ,
+               target_type ,
+               weight_type ,
+               dim_target - 1 >()
+   ( 0 ,
+     weight ,
+     grid_weight ,
+     ORDER ,
+     grid_ofs ,
+     *itp ,
+     result ) ;
+
+  // clean up
+  for ( int d = 0 ; d < dim_target ; d++ )
+  {
+    delete[] grid_weight[d] ;
+    delete[] grid_ofs[d] ;
+  }
+  
+}
+
+// this is the multithreaded version of grid_eval, which sets up the
+// full range over 'result' and calls 'multithread' to do the rest
+
+/// grid_eval evaluates a b-spline object (TODO: general interpolator obj)
+/// at points whose coordinates are distributed in a grid, so that for
+/// every axis there is a set of as many coordinates as this axis is long,
+/// which will be used in the grid as the coordinate for this axis at the
+/// corresponding position. The resulting coordinate matrix (which remains
+/// implicit) is like a mesh grid made from the per-axis coordinates.
+///
+/// If we have two dimensions and x coordinates x0, x1 and x2, and y
+/// coordinates y0 and y1, the resulting implicit coordinate matrix is
+///
+/// (x0,y0) (x1,y0) (x2,y0)
+///
+/// (x0,y1) (x1,y1) (x2,y1)
+///
+/// since the offsets and weights needed to perform an interpolation
+/// only depend on the coordinates, this highly redundant coordinate array
+/// can be processed more efficiently by precalculating the offset component
+/// and weight component for all axes and then simply permuting them to
+/// obtain the result. Especially for higher-degree and higher-dimensional
+/// splines this saves quite some time, since the generation of weights
+/// is computationally expensive.
+///
+/// grid_eval is useful for generating scaled representation of the original
+/// data, but when scaling down, aliasing will occur and the data should be
+/// low-pass-filtered adequately before processing.
+
+// TODO: I intend to implement a process where evaluation to a smaller target
+// is performed on unfiltered data with a b-spline evaluator of suitable degree,
+// resulting in a suitably filtered scaled-down version. This process is then 
+// repeated several times, resulting in a pyramid which can be used to obtain
+// antialiased interpolates at all scales.
+
+template < typename interpolator_type , // type offering eval()
+           typename target_type ,
+           typename weight_type ,       // singular weight data type
+           typename rc_type >           // singular real coordinate
+void grid_eval ( rc_type ** const grid_coordinate ,
+                 const interpolator_type & itp ,
+                 target_type & result )
+{
+  const int dim_target = target_type::actual_dimension ;
+  shape_range_type < dim_target > range ( shape_type < dim_target > () , result.shape() ) ;
+  multithread ( st_grid_eval < interpolator_type , target_type , weight_type , rc_type > ,
+                ncores , range , grid_coordinate , &itp , &result ) ;
+}
+
 } ; // end of namespace vspline
 
 #endif // VSPLINE_REMAP_H
diff --git a/thread_pool.h b/thread_pool.h
index a917def..4b4318e 100644
--- a/thread_pool.h
+++ b/thread_pool.h
@@ -43,12 +43,6 @@
 namespace vspline
 {
   
-/// code to run a worker thread
-/// We use a thread pool of worker threads. These threads have a very 
-/// simple cycle: They try and obtain a task (std::function<void()>). 
-/// If there is one to be had, it is invoked, otherwise they wait on
-/// task_cv.
-  
 class thread_pool
 {
   // used to switch off the worker threads at program termination
@@ -74,6 +68,12 @@ public:
 
 private:
   
+  /// code to run a worker thread
+  /// We use a thread pool of worker threads. These threads have a very 
+  /// simple cycle: They try and obtain a task (std::function<void()>). 
+  /// If there is one to be had, it is invoked, otherwise they wait on
+  /// task_cv.
+  
   void worker_thread()
   {
     std::function < void() > task ;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/vspline.git



More information about the debian-science-commits mailing list